Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
stenzek
GitHub Repository: stenzek/duckstation
Path: blob/master/dep/lzma/src/Bcj2Enc.c
4253 views
1
/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2
2023-04-02 : Igor Pavlov : Public domain */
3
4
#include "Precomp.h"
5
6
/* #define SHOW_STAT */
7
#ifdef SHOW_STAT
8
#include <stdio.h>
9
#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
10
#else
11
#define PRF2(s)
12
#endif
13
14
#include "Bcj2.h"
15
#include "CpuArch.h"
16
17
#define kTopValue ((UInt32)1 << 24)
18
#define kNumBitModelTotalBits 11
19
#define kBitModelTotal (1 << kNumBitModelTotalBits)
20
#define kNumMoveBits 5
21
22
void Bcj2Enc_Init(CBcj2Enc *p)
23
{
24
unsigned i;
25
p->state = BCJ2_ENC_STATE_ORIG;
26
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
27
p->context = 0;
28
p->flushRem = 5;
29
p->isFlushState = 0;
30
p->cache = 0;
31
p->range = 0xffffffff;
32
p->low = 0;
33
p->cacheSize = 1;
34
p->ip64 = 0;
35
p->fileIp64 = 0;
36
p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
37
p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
38
// p->relatExcludeBits = 0;
39
p->tempPos = 0;
40
for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
41
p->probs[i] = kBitModelTotal >> 1;
42
}
43
44
// Z7_NO_INLINE
45
Z7_FORCE_INLINE
46
static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
47
{
48
const UInt32 low = (UInt32)p->low;
49
const unsigned high = (unsigned)
50
#if defined(Z7_MSC_VER_ORIGINAL) \
51
&& defined(MY_CPU_X86) \
52
&& defined(MY_CPU_LE) \
53
&& !defined(MY_CPU_64BIT)
54
// we try to rid of __aullshr() call in MSVS-x86
55
(((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
56
#else
57
(p->low >> 32);
58
#endif
59
if (low < (UInt32)0xff000000 || high != 0)
60
{
61
Byte *buf = p->bufs[BCJ2_STREAM_RC];
62
do
63
{
64
if (buf == p->lims[BCJ2_STREAM_RC])
65
{
66
p->state = BCJ2_STREAM_RC;
67
p->bufs[BCJ2_STREAM_RC] = buf;
68
return True;
69
}
70
*buf++ = (Byte)(p->cache + high);
71
p->cache = 0xff;
72
}
73
while (--p->cacheSize);
74
p->bufs[BCJ2_STREAM_RC] = buf;
75
p->cache = (Byte)(low >> 24);
76
}
77
p->cacheSize++;
78
p->low = low << 8;
79
return False;
80
}
81
82
83
/*
84
We can use 2 alternative versions of code:
85
1) non-marker version:
86
Byte CBcj2Enc::context
87
Byte temp[8];
88
Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
89
Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
90
with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
91
92
2) marker version:
93
UInt32 CBcj2Enc::context
94
Byte CBcj2Enc::temp[4];
95
MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker.
96
it's allowed that
97
one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
98
and another call of Bcj2Enc_Encode_2() does offset conversion.
99
So different values of (fileIp) and (fileSize) are possible
100
in these different Bcj2Enc_Encode_2() calls.
101
102
Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
103
So we use non-marker version.
104
*/
105
106
/*
107
Corner cases with overlap in multi-block.
108
before v23: there was one corner case, where converted instruction
109
could start in one sub-stream and finish in next sub-stream.
110
If multi-block (solid) encoding is used,
111
and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
112
and (0f) is last byte of previous sub-stream
113
and (8x) is first byte of current sub-stream
114
then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
115
BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage,
116
if that offset meets limit requirements.
117
If encoder allows 32-bit offset conversion for such overlap case,
118
then the data in 3 uncompressed BCJ2 streams for some sub-stream
119
can depend from data of previous sub-stream.
120
That corner case is not big problem, and it's rare case.
121
Since v23.00 we do additional check to prevent conversions in such overlap cases.
122
*/
123
124
/*
125
Bcj2Enc_Encode_2() output variables at exit:
126
{
127
if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
128
{
129
it means that encoder needs more input data.
130
if (p->srcLim == p->src) at exit, then
131
{
132
(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
133
all input data were read and processed, and we are ready for
134
new input data.
135
}
136
else
137
{
138
(p->srcLim != p->src)
139
(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
140
The encoder have found e8/e9/0f_8x marker,
141
and p->src points to last byte of that marker,
142
Bcj2Enc_Encode_2() needs more input data to get totally
143
5 bytes (last byte of marker and 32-bit branch offset)
144
as continuous array starting from p->src.
145
(p->srcLim - p->src < 5) requirement is met after exit.
146
So non-processed resedue from p->src to p->srcLim is always less than 5 bytes.
147
}
148
}
149
}
150
*/
151
152
Z7_NO_INLINE
153
static void Bcj2Enc_Encode_2(CBcj2Enc *p)
154
{
155
if (!p->isFlushState)
156
{
157
const Byte *src;
158
UInt32 v;
159
{
160
const unsigned state = p->state;
161
if (BCJ2_IS_32BIT_STREAM(state))
162
{
163
Byte *cur = p->bufs[state];
164
if (cur == p->lims[state])
165
return;
166
SetBe32a(cur, p->tempTarget)
167
p->bufs[state] = cur + 4;
168
}
169
}
170
p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
171
src = p->src;
172
v = p->context;
173
174
// #define WRITE_CONTEXT p->context = v; // for marker version
175
#define WRITE_CONTEXT p->context = (Byte)v;
176
#define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT
177
178
for (;;)
179
{
180
// const Byte *src;
181
// UInt32 v;
182
CBcj2Enc_ip_unsigned ip;
183
if (p->range < kTopValue)
184
{
185
// to reduce register pressure and code size: we save and restore local variables.
186
WRITE_CONTEXT_AND_SRC
187
if (Bcj2_RangeEnc_ShiftLow(p))
188
return;
189
p->range <<= 8;
190
src = p->src;
191
v = p->context;
192
}
193
// src = p->src;
194
// #define MARKER_FLAG ((UInt32)1 << 17)
195
// if ((v & MARKER_FLAG) == 0) // for marker version
196
{
197
const Byte *srcLim;
198
Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
199
{
200
const SizeT remSrc = (SizeT)(p->srcLim - src);
201
SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
202
if (rem >= remSrc)
203
rem = remSrc;
204
srcLim = src + rem;
205
}
206
/* p->context contains context of previous byte:
207
bits [0 : 7] : src[-1], if (src) was changed in this call
208
bits [8 : 31] : are undefined for non-marker version
209
*/
210
// v = p->context;
211
#define NUM_SHIFT_BITS 24
212
#define CONV_FLAG ((UInt32)1 << 16)
213
#define ONE_ITER { \
214
b = src[0]; \
215
*dest++ = (Byte)b; \
216
v = (v << NUM_SHIFT_BITS) | b; \
217
if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
218
if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
219
((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
220
src++; if (src == srcLim) { break; } }
221
222
if (src != srcLim)
223
for (;;)
224
{
225
/* clang can generate ineffective code with setne instead of two jcc instructions.
226
we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */
227
unsigned b;
228
ONE_ITER
229
ONE_ITER
230
}
231
232
ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
233
p->bufs[BCJ2_STREAM_MAIN] = dest;
234
p->ip64 = ip;
235
236
if (src == srcLim)
237
{
238
WRITE_CONTEXT_AND_SRC
239
if (src != p->srcLim)
240
{
241
p->state = BCJ2_STREAM_MAIN;
242
return;
243
}
244
/* (p->src == p->srcLim)
245
(p->state == BCJ2_ENC_STATE_ORIG) */
246
if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
247
return;
248
/* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
249
// (p->flushRem == 5);
250
p->isFlushState = 1;
251
break;
252
}
253
src++;
254
// p->src = src;
255
}
256
// ip = p->ip; // for marker version
257
/* marker was found */
258
/* (v) contains marker that was found:
259
bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
260
: value of src[-2] : xx/xx/0f
261
bits [0 : 7] : value of src[-1] : e8/e9/8x
262
*/
263
{
264
{
265
#if NUM_SHIFT_BITS != 24
266
v &= ~(UInt32)CONV_FLAG;
267
#endif
268
// UInt32 relat = 0;
269
if ((SizeT)(p->srcLim - src) >= 4)
270
{
271
/*
272
if (relat != 0 || (Byte)v != 0xe8)
273
BoolInt isBigOffset = True;
274
*/
275
const UInt32 relat = GetUi32(src);
276
/*
277
#define EXCLUDE_FLAG ((UInt32)1 << 4)
278
#define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
279
if (p->relatExcludeBits != 0)
280
{
281
const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
282
isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
283
}
284
// isBigOffset = False; // for debug
285
*/
286
ip -= p->fileIp64;
287
// Use the following if check, if (ip) is 64-bit:
288
if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)
289
if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
290
if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
291
v |= CONV_FLAG;
292
}
293
else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
294
{
295
// (p->srcLim - src < 4)
296
// /*
297
// for non-marker version
298
p->ip64--; // p->ip = ip - 1;
299
p->bufs[BCJ2_STREAM_MAIN]--;
300
src--;
301
v >>= NUM_SHIFT_BITS;
302
// (0 < p->srcLim - p->src <= 4)
303
// */
304
// v |= MARKER_FLAG; // for marker version
305
/* (p->state == BCJ2_ENC_STATE_ORIG) */
306
WRITE_CONTEXT_AND_SRC
307
return;
308
}
309
{
310
const unsigned c = ((v + 0x17) >> 6) & 1;
311
CBcj2Prob *prob = p->probs + (unsigned)
312
(((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
313
/*
314
((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
315
((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));
316
*/
317
const unsigned ttt = *prob;
318
const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
319
if ((v & CONV_FLAG) == 0)
320
{
321
// static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
322
// v = (Byte)v; // for marker version
323
p->range = bound;
324
*prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
325
// WRITE_CONTEXT_AND_SRC
326
continue;
327
}
328
p->low += bound;
329
p->range -= bound;
330
*prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
331
}
332
// p->context = src[3];
333
{
334
// const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
335
const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
336
ip = p->ip64;
337
v = GetUi32(src); // relat
338
ip += 4;
339
p->ip64 = ip;
340
src += 4;
341
// p->src = src;
342
{
343
const UInt32 absol = (UInt32)ip + v;
344
Byte *cur = p->bufs[cj];
345
v >>= 24;
346
// WRITE_CONTEXT
347
if (cur == p->lims[cj])
348
{
349
p->state = cj;
350
p->tempTarget = absol;
351
WRITE_CONTEXT_AND_SRC
352
return;
353
}
354
SetBe32a(cur, absol)
355
p->bufs[cj] = cur + 4;
356
}
357
}
358
}
359
}
360
} // end of loop
361
}
362
363
for (; p->flushRem != 0; p->flushRem--)
364
if (Bcj2_RangeEnc_ShiftLow(p))
365
return;
366
p->state = BCJ2_ENC_STATE_FINISHED;
367
}
368
369
370
/*
371
BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
372
So base function Bcj2Enc_Encode_2()
373
in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
374
(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
375
Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
376
so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
377
then (p->src == p->srcLim).
378
And the caller's code is simpler with Bcj2Enc_Encode().
379
*/
380
381
Z7_NO_INLINE
382
void Bcj2Enc_Encode(CBcj2Enc *p)
383
{
384
PRF2("\n----")
385
if (p->tempPos != 0)
386
{
387
/* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
388
unsigned extra = 0;
389
/* We will touch only minimal required number of bytes in input (src) stream.
390
So we will add input bytes from (src) stream to temp[] with step of 1 byte.
391
We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
392
in first loop iteration because
393
- previous call of Bcj2Enc_Encode() could use another (finishMode),
394
- previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
395
the case with full temp[] buffer (p->tempPos == 4) is possible here.
396
*/
397
for (;;)
398
{
399
// (0 < p->tempPos <= 5) // in non-marker version
400
/* p->src : the current src data position including extra bytes
401
that were copied to temp[] buffer in this call */
402
const Byte *src = p->src;
403
const Byte *srcLim = p->srcLim;
404
const EBcj2Enc_FinishMode finishMode = p->finishMode;
405
if (src != srcLim)
406
{
407
/* if there are some src data after the data copied to temp[],
408
then we use MODE_CONTINUE for temp data */
409
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
410
}
411
p->src = p->temp;
412
p->srcLim = p->temp + p->tempPos;
413
PRF2(" ")
414
Bcj2Enc_Encode_2(p);
415
{
416
const unsigned num = (unsigned)(p->src - p->temp);
417
const unsigned tempPos = p->tempPos - num;
418
unsigned i;
419
p->tempPos = tempPos;
420
for (i = 0; i < tempPos; i++)
421
p->temp[i] = p->temp[(SizeT)i + num];
422
// tempPos : number of bytes in temp buffer
423
p->src = src;
424
p->srcLim = srcLim;
425
p->finishMode = finishMode;
426
if (p->state != BCJ2_ENC_STATE_ORIG)
427
{
428
// (p->tempPos <= 4) // in non-marker version
429
/* if (the reason of exit from Bcj2Enc_Encode_2()
430
is not BCJ2_ENC_STATE_ORIG),
431
then we exit from Bcj2Enc_Encode() with same reason */
432
// optional code begin : we rollback (src) and tempPos, if it's possible:
433
if (extra >= tempPos)
434
extra = tempPos;
435
p->src = src - extra;
436
p->tempPos = tempPos - extra;
437
// optional code end : rollback of (src) and tempPos
438
return;
439
}
440
/* (p->tempPos <= 4)
441
(p->state == BCJ2_ENC_STATE_ORIG)
442
so encoder needs more data than in temp[] */
443
if (src == srcLim)
444
return; // src buffer has no more input data.
445
/* (src != srcLim)
446
so we can provide more input data from src for Bcj2Enc_Encode_2() */
447
if (extra >= tempPos)
448
{
449
/* (extra >= tempPos) means that temp buffer contains
450
only data from src buffer of this call.
451
So now we can encode without temp buffer */
452
p->src = src - tempPos; // rollback (src)
453
p->tempPos = 0;
454
break;
455
}
456
// we append one additional extra byte from (src) to temp[] buffer:
457
p->temp[tempPos] = *src;
458
p->tempPos = tempPos + 1;
459
// (0 < p->tempPos <= 5) // in non-marker version
460
p->src = src + 1;
461
extra++;
462
}
463
}
464
}
465
466
PRF2("++++")
467
// (p->tempPos == 0)
468
Bcj2Enc_Encode_2(p);
469
PRF2("====")
470
471
if (p->state == BCJ2_ENC_STATE_ORIG)
472
{
473
const Byte *src = p->src;
474
const Byte *srcLim = p->srcLim;
475
const unsigned rem = (unsigned)(srcLim - src);
476
/* (rem <= 4) here.
477
if (p->src != p->srcLim), then
478
- we copy non-processed bytes from (p->src) to temp[] buffer,
479
- we set p->src equal to p->srcLim.
480
*/
481
if (rem)
482
{
483
unsigned i = 0;
484
p->src = srcLim;
485
p->tempPos = rem;
486
// (0 < p->tempPos <= 4)
487
do
488
p->temp[i] = src[i];
489
while (++i != rem);
490
}
491
// (p->tempPos <= 4)
492
// (p->src == p->srcLim)
493
}
494
}
495
496
#undef PRF2
497
#undef CONV_FLAG
498
#undef MARKER_FLAG
499
#undef WRITE_CONTEXT
500
#undef WRITE_CONTEXT_AND_SRC
501
#undef ONE_ITER
502
#undef NUM_SHIFT_BITS
503
#undef kTopValue
504
#undef kNumBitModelTotalBits
505
#undef kBitModelTotal
506
#undef kNumMoveBits
507
508