GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/zstd/lib/decompress/zstd_decompress_block.c
/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ blocks */

/*-*******************************************************
*  Dependencies
*********************************************************/
#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
#include "../common/compiler.h"    /* prefetch */
#include "../common/cpu.h"         /* bmi2 */
#include "../common/mem.h"         /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"

/*_*******************************************************
*  Macros
**********************************************************/

/* These two optional macros force the use of one or the other of the two
 * ZSTD_decompressSequences implementations. You can't force both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif


/*_*******************************************************
*  Memory operations
**********************************************************/
static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


/*-*************************************************************
*   Block decoding
***************************************************************/

/*! ZSTD_getcBlockSize() :
 *  Provides the size of compressed block from block header `src` */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                          blockProperties_t* bpPtr)
{
    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
        bpPtr->lastBlock = cBlockHeader & 1;
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
}
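
/* Worked example (illustrative note, not part of the upstream file):
 * header bytes {0x0D, 0x01, 0x00} read little-endian give cBlockHeader = 0x00010D,
 * so lastBlock = 1, blockType = (cBlockHeader >> 1) & 3 = 2 (bt_compressed),
 * and cSize = cBlockHeader >> 3 = 33 compressed bytes. */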

/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
                                        const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
{
    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
    {
        /* room for litbuffer to fit without read faulting */
        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_in_dst;
    }
    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
    {
        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
        if (splitImmediately) {
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
        }
        else {
            /* initially this will be stored entirely in dst during huffman decoding; it will be partially shifted to litExtraBuffer afterwards */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
        }
        dctx->litBufferLocation = ZSTD_split;
    }
    else
    {
        /* fits entirely within litExtraBuffer, so no split is necessary */
        dctx->litBuffer = dctx->litExtraBuffer;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    }
}
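
/* Illustrative note (not part of the upstream file), assuming
 * ZSTD_BLOCKSIZE_MAX == 128 KB: a non-streaming call whose dstCapacity
 * comfortably exceeds 128 KB + litSize + 2 * WILDCOPY_OVERLENGTH keeps the
 * literals inside dst, past the block's write region (ZSTD_in_dst); a litSize
 * no larger than ZSTD_LITBUFFEREXTRASIZE lands wholly in litExtraBuffer
 * (ZSTD_not_in_dst); anything in between is split (ZSTD_split). */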

/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity, const streaming_operation streaming);
/*! ZSTD_decodeLiteralsBlock() :
 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
 * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
 * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
 *
 * @return : nb of bytes read from src (< srcSize)
 *  note : symbol not declared but exposed for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                          void* dst, size_t dstCapacity, const streaming_operation streaming)
{
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);

        switch(litEncType)
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
            ZSTD_FALLTHROUGH;

        case set_compressed:
            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    /* 2 - 2 - 10 - 10 */
                    singleStream = !lhlCode;
                    lhSize = 3;
                    litSize  = (lhc >> 4) & 0x3FF;
                    litCSize = (lhc >> 14) & 0x3FF;
                    break;
                case 2:
                    /* 2 - 2 - 14 - 14 */
                    lhSize = 4;
                    litSize  = (lhc >> 4) & 0x3FFF;
                    litCSize = lhc >> 18;
                    break;
                case 3:
                    /* 2 - 2 - 18 - 18 */
                    lhSize = 5;
                    litSize  = (lhc >> 4) & 0x3FFFF;
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
                }

                if (litEncType==set_repeat) {
                    if (singleStream) {
                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
                    } else {
                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
                    }
                } else {
                    if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace));
#else
                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
#endif
                    } else {
                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
                    }
                }
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
                }

                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
                    if (dctx->litBufferLocation == ZSTD_split)
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    }
                    else
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
                    }
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
                dctx->litBufferEnd = dctx->litPtr + litSize;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
                }
                else
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
                }
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
            }
        default:
            RETURN_ERROR(corruption_detected, "impossible");
        }
    }
}
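
/* Worked example (illustrative note, not part of the upstream file):
 * for a compressed-literals header with lhc == 0x1902A2, the low bits give
 * litEncType == 2 (set_compressed) and lhlCode == 0, so lhSize = 3,
 * singleStream = 1, litSize = (lhc >> 4) & 0x3FF == 42, and
 * litCSize = (lhc >> 14) & 0x3FF == 100. */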

/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - printout the content of tables
 * - prettify output, reported below, test with fuzzer to ensure it's correct */

/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
     /* nextState, nbAddBits, nbBits, baseVal */
     {  0,  0,  4,    0},  { 16,  0,  4,    0},
     { 32,  0,  5,    1},  {  0,  0,  5,    3},
     {  0,  0,  5,    4},  {  0,  0,  5,    6},
     {  0,  0,  5,    7},  {  0,  0,  5,    9},
     {  0,  0,  5,   10},  {  0,  0,  5,   12},
     {  0,  0,  6,   14},  {  0,  1,  5,   16},
     {  0,  1,  5,   20},  {  0,  1,  5,   22},
     {  0,  2,  5,   28},  {  0,  3,  5,   32},
     {  0,  4,  5,   48},  { 32,  6,  5,   64},
     {  0,  7,  5,  128},  {  0,  8,  6,  256},
     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
     { 32,  0,  4,    0},  {  0,  0,  4,    1},
     {  0,  0,  5,    2},  { 32,  0,  5,    4},
     {  0,  0,  5,    5},  { 32,  0,  5,    7},
     {  0,  0,  5,    8},  { 32,  0,  5,   10},
     {  0,  0,  5,   11},  {  0,  0,  6,   13},
     { 32,  1,  5,   16},  {  0,  1,  5,   18},
     { 32,  1,  5,   22},  {  0,  2,  5,   24},
     { 32,  3,  5,   32},  {  0,  3,  5,   40},
     {  0,  6,  4,   64},  { 16,  6,  4,   64},
     { 32,  7,  5,  128},  {  0,  9,  6,  512},
     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
     { 16,  0,  4,    1},  { 32,  0,  5,    2},
     { 32,  0,  5,    3},  { 32,  0,  5,    5},
     { 32,  0,  5,    6},  { 32,  0,  5,    8},
     { 32,  0,  5,    9},  { 32,  0,  5,   11},
     { 32,  0,  5,   12},  {  0,  0,  6,   15},
     { 32,  1,  5,   18},  { 32,  1,  5,   20},
     { 32,  2,  5,   24},  { 32,  2,  5,   28},
     { 32,  3,  5,   40},  { 32,  4,  5,   48},
     {  0, 16,  6,65536},  {  0, 15,  6,32768},
     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
};   /* LL_defaultDTable */

/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  5,    0},     {  0,  6,  4,   61},
    {  0,  9,  5,  509},     {  0, 15,  5,32765},
    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
    {  0,  5,  5,   29},     {  0,  8,  4,  253},
    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
    {  0,  2,  5,    1},     { 16,  7,  4,  125},
    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
    {  0, 19,  5,524285},    {  0,  1,  5,    1},
    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
};   /* OF_defaultDTable */


/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  6,    3},  {  0,  0,  4,    4},
    { 32,  0,  5,    5},  {  0,  0,  5,    6},
    {  0,  0,  5,    8},  {  0,  0,  5,    9},
    {  0,  0,  5,   11},  {  0,  0,  6,   13},
    {  0,  0,  6,   16},  {  0,  0,  6,   19},
    {  0,  0,  6,   22},  {  0,  0,  6,   25},
    {  0,  0,  6,   28},  {  0,  0,  6,   31},
    {  0,  0,  6,   34},  {  0,  1,  6,   37},
    {  0,  1,  6,   41},  {  0,  2,  6,   47},
    {  0,  3,  6,   59},  {  0,  4,  6,   83},
    {  0,  7,  6,  131},  {  0,  9,  6,  515},
    { 16,  0,  4,    4},  {  0,  0,  4,    5},
    { 32,  0,  5,    6},  {  0,  0,  5,    7},
    { 32,  0,  5,    9},  {  0,  0,  5,   10},
    {  0,  0,  6,   12},  {  0,  0,  6,   15},
    {  0,  0,  6,   18},  {  0,  0,  6,   21},
    {  0,  0,  6,   24},  {  0,  0,  6,   27},
    {  0,  0,  6,   30},  {  0,  0,  6,   33},
    {  0,  1,  6,   35},  {  0,  1,  6,   39},
    {  0,  2,  6,   43},  {  0,  3,  6,   51},
    {  0,  4,  6,   67},  {  0,  5,  6,   99},
    {  0,  8,  6,  259},  { 32,  0,  4,    4},
    { 48,  0,  4,    4},  { 16,  0,  4,    5},
    { 32,  0,  5,    7},  { 32,  0,  5,    8},
    { 32,  0,  5,   10},  { 32,  0,  5,   11},
    {  0,  0,  6,   14},  {  0,  0,  6,   17},
    {  0,  0,  6,   20},  {  0,  0,  6,   23},
    {  0,  0,  6,   26},  {  0,  0,  6,   29},
    {  0,  0,  6,   32},  {  0, 16,  6,65539},
    {  0, 15,  6,32771},  {  0, 14,  6,16387},
    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
};   /* ML_defaultDTable */


static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
{
    void* ptr = dt;
    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
    ZSTD_seqSymbol* const cell = dt + 1;

    DTableH->tableLog = 0;
    DTableH->fastMode = 0;

    cell->nbBits = 0;
    cell->nextState = 0;
    assert(nbAddBits < 255);
    cell->nbAdditionalBits = nbAddBits;
    cell->baseValue = baseValue;
}


/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
FORCE_INLINE_TEMPLATE
void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_seqSymbol* const tableDecode = dt+1;
    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;

    U16* symbolNext = (U16*)wksp;
    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
    U32 highThreshold = tableSize - 1;


    /* Sanity Checks */
    assert(maxSymbolValue <= MaxSeq);
    assert(tableLog <= MaxFSELog);
    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
    (void)wkspSize;
    /* Init, lay down lowprob symbols */
    {   ZSTD_seqSymbol_header DTableH;
        DTableH.tableLog = tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].baseValue = s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    assert(normalizedCounter[s]>=0);
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    assert(tableSize <= 512);
    /* Specialized symbol spreading for the case when there are
     * no low probability (-1 count) symbols. When compressing
     * small blocks we avoid low probability symbols to hit this
     * case, since header decoding speed matters more.
     */
    if (highThreshold == tableSize - 1) {
        size_t const tableMask = tableSize-1;
        size_t const step = FSE_TABLESTEP(tableSize);
        /* First lay down the symbols in order.
         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
         * misses since small blocks generally have small table logs, so nearly
         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
         * our buffer to handle the over-write.
         */
        {
            U64 const add = 0x0101010101010101ull;
            size_t pos = 0;
            U64 sv = 0;
            U32 s;
            for (s=0; s<maxSV1; ++s, sv += add) {
                int i;
                int const n = normalizedCounter[s];
                MEM_write64(spread + pos, sv);
                for (i = 8; i < n; i += 8) {
                    MEM_write64(spread + pos + i, sv);
                }
                pos += n;
            }
        }
        /* Now we spread those positions across the table.
         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
         * We unroll the loop twice, since that is what empirically worked best.
         */
        {
            size_t position = 0;
            size_t s;
            size_t const unroll = 2;
            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
            for (s = 0; s < (size_t)tableSize; s += unroll) {
                size_t u;
                for (u = 0; u < unroll; ++u) {
                    size_t const uPosition = (position + (u * step)) & tableMask;
                    tableDecode[uPosition].baseValue = spread[s + u];
                }
                position = (position + (unroll * step)) & tableMask;
            }
            assert(position == 0);
        }
    } else {
        U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
            int const n = normalizedCounter[s];
            for (i=0; i<n; i++) {
                tableDecode[position].baseValue = s;
                position = (position + step) & tableMask;
                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        assert(position == 0);  /* position must reach all cells once, otherwise normalizedCounter is incorrect */
    }

    /* Build Decoding table */
    {
        U32 u;
        for (u=0; u<tableSize; u++) {
            U32 const symbol = tableDecode[u].baseValue;
            U32 const nextState = symbolNext[symbol]++;
            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
            assert(nbAdditionalBits[symbol] < 255);
            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
            tableDecode[u].baseValue = baseValue[symbol];
        }
    }
}

/* Avoids the FORCE_INLINE of the _body() function. */
static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}

#if DYNAMIC_BMI2
BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#endif

void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
    if (bmi2) {
        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
        return;
    }
#endif
    (void)bmi2;
    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
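
/* Illustrative usage sketch (not part of the upstream file): the default
 * tables above can be regenerated with this function, assuming the default
 * distributions LL_defaultNorm / LL_DEFAULTNORMLOG from zstd_internal.h and a
 * hypothetical workspace sized per ZSTD_BUILD_FSE_TABLE_WKSP_SIZE:
 *
 *     ZSTD_seqSymbol dt[SEQSYMBOL_TABLE_SIZE(LL_DEFAULTNORMLOG)];
 *     U32 wksp[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE / sizeof(U32)];
 *     ZSTD_buildFSETable(dt, LL_defaultNorm, MaxLL, LL_base, LL_bits,
 *                        LL_DEFAULTNORMLOG, wksp, sizeof(wksp), 0);
 *
 * dt should then match LL_defaultDTable above. */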


/*! ZSTD_buildSeqTable() :
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U8* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
                                 int bmi2)
{
    switch(type)
    {
    case set_rle :
        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
            U8 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
        return 1;
    case set_basic :
        *DTablePtr = defaultTable;
        return 0;
    case set_repeat:
        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
        }
        return 0;
    case set_compressed :
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
    default :
        assert(0);
        RETURN_ERROR(GENERIC, "impossible");
    }
}

size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                             const void* src, size_t srcSize)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

    /* SeqHead */
    nbSeq = *ip++;
    if (!nbSeq) {
        *nbSeqPtr=0;
        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
        return 1;
    }
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
            ip+=2;
        } else {
            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
    }
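
    /* Worked example (illustrative note, not part of the upstream file):
     * a first byte of 0x20 encodes nbSeq = 32 directly; bytes {0x81, 0x05}
     * encode nbSeq = ((0x81-0x80) << 8) + 0x05 = 261; bytes {0xFF, 0x34, 0x12}
     * encode nbSeq = 0x1234 + LONGNBSEQ. */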
    *nbSeqPtr = nbSeq;

    /* FSE table descriptors */
    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}


typedef struct {
    size_t litLength;
    size_t matchLength;
    size_t offset;
} seq_t;

typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;

/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
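
/* Worked example (illustrative note, not part of the upstream file): with
 * offset == 2, the first 4 bytes are copied one at a time (a period-2 repeat),
 * the source then advances by dec32table[2] == 2 so ZSTD_copy4() can fill
 * bytes 4..7 from freshly written output, and the source is finally pulled
 * back by dec64table[2] == 8, leaving *op - *ip == 8 so later copies can
 * proceed 8 bytes at a time. */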

/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op += oend_w - op;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_safecopyDstBeforeSrc():
 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    if (length < 8 || diff > -8) {
        /* Handle short lengths, close overlaps, and dst not before src. */
        while (op < oend) *op++ = *ip++;
        return;
    }

    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
        ip += oend - WILDCOPY_OVERLENGTH - op;
        op += oend - WILDCOPY_OVERLENGTH - op;
    }

    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
size_t ZSTD_execSequenceEnd(BYTE* op,
                            BYTE* const oend, seq_t sequence,
                            const BYTE** litPtr, const BYTE* const litLimit,
                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

/* ZSTD_execSequenceEndSplitLitBuffer():
 * This version is intended to be used while the litBuffer is still split. It is kept separate to avoid performance impact on the good case.
 */
FORCE_NOINLINE
size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
                            BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
                            const BYTE** litPtr, const BYTE* const litLimit,
                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;


    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

HINT_INLINE
size_t ZSTD_execSequence(BYTE* op,
                         BYTE* const oend, seq_t sequence,
                         const BYTE** litPtr, const BYTE* const litLimit,
                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* risk : address space underflow on oend=NULL */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLENGTH of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}

HINT_INLINE
size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
                                       BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
                                       const BYTE** litPtr, const BYTE* const litLimit,
                                       const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLENGTH of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}


static void
ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
{
    const void* ptr = dt;
    const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
                (U32)DStatePtr->state, DTableH->tableLog);
    BIT_reloadDStream(bitD);
    DStatePtr->table = dt + 1;
}

FORCE_INLINE_TEMPLATE void
ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
{
    size_t const lowBits = BIT_readBits(bitD, nbBits);
    DStatePtr->state = nextState + lowBits;
}

/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 * bits before reloading. This value is the maximum number of bits we read
 * after reloading when we are decoding long offsets.
 */
#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
        : 0)
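
/* Example (illustrative note, not part of the upstream file): with
 * ZSTD_WINDOWLOG_MAX_32 == 30 and STREAM_ACCUMULATOR_MIN_32 == 25,
 * LONG_OFFSETS_MAX_EXTRA_BITS_32 evaluates to 5, which matches the static
 * assert in ZSTD_decodeSequence() below. */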

typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;

FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
    seq_t seq;
    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
    seq.matchLength = mlDInfo->baseValue;
    seq.litLength = llDInfo->baseValue;
    {   U32 const ofBase = ofDInfo->baseValue;
        BYTE const llBits = llDInfo->nbAdditionalBits;
        BYTE const mlBits = mlDInfo->nbAdditionalBits;
        BYTE const ofBits = ofDInfo->nbAdditionalBits;
        BYTE const totalBits = llBits+mlBits+ofBits;

        U16 const llNext = llDInfo->nextState;
        U16 const mlNext = mlDInfo->nextState;
        U16 const ofNext = ofDInfo->nextState;
        U32 const llnbBits = llDInfo->nbBits;
        U32 const mlnbBits = mlDInfo->nbBits;
        U32 const ofnbBits = ofDInfo->nbBits;
        /*
         * As gcc has better branch and block analyzers, sometimes it is only
         * valuable to mark likeliness for clang; doing so gives around 3-4% of
         * performance.
         */

        /* sequence */
        {   size_t offset;
#if defined(__clang__)
            if (LIKELY(ofBits > 1)) {
#else
            if (ofBits > 1) {
#endif
                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
                assert(ofBits <= MaxOff);
                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                    BIT_reloadDStream(&seqState->DStream);
                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
                } else {
                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
                }
                seqState->prevOffset[2] = seqState->prevOffset[1];
                seqState->prevOffset[1] = seqState->prevOffset[0];
                seqState->prevOffset[0] = offset;
            } else {
                U32 const ll0 = (llDInfo->baseValue == 0);
                if (LIKELY((ofBits == 0))) {
                    offset = seqState->prevOffset[ll0];
                    seqState->prevOffset[1] = seqState->prevOffset[!ll0];
                    seqState->prevOffset[0] = offset;
                } else {
                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                        seqState->prevOffset[1] = seqState->prevOffset[0];
                        seqState->prevOffset[0] = offset = temp;
            }   }   }
            seq.offset = offset;
        }
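
        /* Note (illustrative, not part of the upstream file): the ofBits <= 1
         * branch above implements the format's repeat offsets. ofBits == 0
         * reuses prevOffset[0], or prevOffset[1] when the literal length is 0
         * (ll0 == 1); ofBits == 1 selects among the remaining history, with
         * the raw value 3 mapping to prevOffset[0] - 1, and the chosen offset
         * is rotated to the front of the prevOffset[] history. */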

#if defined(__clang__)
        if (UNLIKELY(mlBits > 0))
#else
        if (mlBits > 0)
#endif
            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
            BIT_reloadDStream(&seqState->DStream);
        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
            BIT_reloadDStream(&seqState->DStream);
        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

#if defined(__clang__)
        if (UNLIKELY(llBits > 0))
#else
        if (llBits > 0)
#endif
            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

        if (MEM_32bits())
            BIT_reloadDStream(&seqState->DStream);

        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
    }

    return seq;
}

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
{
    size_t const windowSize = dctx->fParams.windowSize;
    /* No dictionary used. */
    if (dctx->dictContentEndForFuzzing == NULL) return 0;
    /* Dictionary is our prefix. */
    if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
    /* Dictionary is not our ext-dict. */
    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
    /* Dictionary is not within our window size. */
    if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
    /* Dictionary is active. */
    return 1;
}

MEM_STATIC void ZSTD_assertValidSequence(
        ZSTD_DCtx const* dctx,
        BYTE const* op, BYTE const* oend,
        seq_t const seq,
        BYTE const* prefixStart, BYTE const* virtualStart)
{
#if DEBUGLEVEL >= 1
    size_t const windowSize = dctx->fParams.windowSize;
    size_t const sequenceSize = seq.litLength + seq.matchLength;
    BYTE const* const oLitEnd = op + seq.litLength;
    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
    assert(op <= oend);
    assert((size_t)(oend - op) >= sequenceSize);
    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
        /* Offset must be within the dictionary. */
        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
        assert(seq.offset <= windowSize + dictSize);
    } else {
        /* Offset must be within our window. */
        assert(seq.offset <= windowSize);
    }
#else
    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
#endif
}
#endif
1316
1317
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1318
1319
1320
FORCE_INLINE_TEMPLATE size_t
1321
DONT_VECTORIZE
1322
ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
1323
void* dst, size_t maxDstSize,
1324
const void* seqStart, size_t seqSize, int nbSeq,
1325
const ZSTD_longOffset_e isLongOffset,
1326
const int frame)
1327
{
1328
const BYTE* ip = (const BYTE*)seqStart;
1329
const BYTE* const iend = ip + seqSize;
1330
BYTE* const ostart = (BYTE*)dst;
1331
BYTE* const oend = ostart + maxDstSize;
1332
BYTE* op = ostart;
1333
const BYTE* litPtr = dctx->litPtr;
1334
const BYTE* litBufferEnd = dctx->litBufferEnd;
1335
const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1336
const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1337
const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1338
DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1339
(void)frame;
1340
1341
/* Regen sequences */
1342
if (nbSeq) {
1343
seqState_t seqState;
1344
dctx->fseEntropy = 1;
1345
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1346
RETURN_ERROR_IF(
1347
ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
1348
corruption_detected, "");
1349
ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1350
ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1351
ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1352
assert(dst != NULL);
1353
1354
ZSTD_STATIC_ASSERT(
1355
BIT_DStream_unfinished < BIT_DStream_completed &&
1356
BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1357
BIT_DStream_completed < BIT_DStream_overflow);
1358
1359
/* decompress without overrunning litPtr begins */
1360
{
1361
seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1362
/* Align the decompression loop to 32 + 16 bytes.
1363
*
1364
* zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1365
* speed swings based on the alignment of the decompression loop. This
1366
* performance swing is caused by parts of the decompression loop falling
1367
* out of the DSB. The entire decompression loop should fit in the DSB,
1368
* when it can't we get much worse performance. You can measure if you've
1369
* hit the good case or the bad case with this perf command for some
1370
* compressed file test.zst:
1371
*
1372
* perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1373
* -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1374
*
1375
* If you see most cycles served out of the MITE you've hit the bad case.
1376
* If you see most cycles served out of the DSB you've hit the good case.
1377
* If it is pretty even then you may be in an okay case.
1378
*
1379
* This issue has been reproduced on the following CPUs:
1380
* - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1381
* Use Instruments->Counters to get DSB/MITE cycles.
1382
* I never got performance swings, but I was able to
1383
* go from the good case of mostly DSB to half of the
1384
* cycles served from MITE.
1385
* - Coffeelake: Intel i9-9900k
1386
* - Coffeelake: Intel i7-9700k
1387
*
1388
* I haven't been able to reproduce the instability or DSB misses on any
1389
* of the following CPUS:
1390
* - Haswell
1391
* - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1392
* - Skylake
1393
*
1394
* Alignment is done for each of the three major decompression loops:
1395
* - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1396
* - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1397
* - ZSTD_decompressSequences_body
1398
* Alignment choices are made to minimize large swings on bad cases and influence on performance
1399
* from changes external to this code, rather than to overoptimize on the current commit.
1400
*
1401
* If you are seeing performance stability this script can help test.
1402
* It tests on 4 commits in zstd where I saw performance change.
1403
*
1404
* https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1405
*/
1406
#if defined(__GNUC__) && defined(__x86_64__)
1407
__asm__(".p2align 6");
1408
# if __GNUC__ >= 7
1409
/* good for gcc-7, gcc-9, and gcc-11 */
1410
__asm__("nop");
1411
__asm__(".p2align 5");
1412
__asm__("nop");
1413
__asm__(".p2align 4");
1414
# if __GNUC__ == 8 || __GNUC__ == 10
1415
/* good for gcc-8 and gcc-10 */
1416
__asm__("nop");
1417
__asm__(".p2align 3");
1418
# endif
1419
# endif
1420
#endif

            /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                    return oneSeqSize;
                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                op += oneSeqSize;
                if (UNLIKELY(!--nbSeq))
                    break;
                BIT_reloadDStream(&(seqState.DStream));
                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            }

            /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
            if (nbSeq > 0) {
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequence.litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                    assert(!ZSTD_isError(oneSeqSize));
                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                        return oneSeqSize;
                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                    op += oneSeqSize;
                    if (--nbSeq)
                        BIT_reloadDStream(&(seqState.DStream));
                }
            }
        }

        if (nbSeq > 0) /* there is remaining lit from extra buffer */
        {

#if defined(__GNUC__) && defined(__x86_64__)
            __asm__(".p2align 6");
            __asm__("nop");
#  if __GNUC__ != 7
            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
            __asm__(".p2align 4");
            __asm__("nop");
            __asm__(".p2align 3");
#  elif __GNUC__ >= 11
            __asm__(".p2align 3");
#  else
            __asm__(".p2align 5");
            __asm__("nop");
            __asm__(".p2align 3");
#  endif
#endif

            for (; ; ) {
                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                    return oneSeqSize;
                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                op += oneSeqSize;
                if (UNLIKELY(!--nbSeq))
                    break;
                BIT_reloadDStream(&(seqState.DStream));
            }
        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
    {
        size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
        litPtr = dctx->litExtraBuffer;
        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    }
    {   size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}
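
/* Editor's note (illustration) : in the split layout, ZSTD_allocateLiteralsBuffer()
 * placed the first part of the literals inside dst itself (dctx->litBuffer) and the
 * remainder in dctx->litExtraBuffer. The first loop above consumes sequences while
 * litPtr still points into dst; once the split point is crossed, the leftover
 * literals are drained with ZSTD_safecopyDstBeforeSrc() and decoding continues out
 * of litExtraBuffer, which is why litBufferLocation flips to ZSTD_not_in_dst
 * mid-function. */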

FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
    const ZSTD_longOffset_e isLongOffset,
    const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* const litEnd = litPtr + dctx->litSize;
    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
    DEBUGLOG(5, "ZSTD_decompressSequences_body");
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
        seqState_t seqState;
        dctx->fseEntropy = 1;
        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
        assert(dst != NULL);

        ZSTD_STATIC_ASSERT(
                BIT_DStream_unfinished < BIT_DStream_completed &&
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

#if defined(__GNUC__) && defined(__x86_64__)
        __asm__(".p2align 6");
        __asm__("nop");
#  if __GNUC__ >= 7
        __asm__(".p2align 5");
        __asm__("nop");
        __asm__(".p2align 3");
#  else
        __asm__(".p2align 4");
        __asm__("nop");
        __asm__(".p2align 3");
#  endif
#endif

        for ( ; ; ) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
            if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                return oneSeqSize;
            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
            op += oneSeqSize;
            if (UNLIKELY(!--nbSeq))
                break;
            BIT_reloadDStream(&(seqState.DStream));
        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    {   size_t const lastLLSize = litEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                                 const void* seqStart, size_t seqSize, int nbSeq,
                                 const ZSTD_longOffset_e isLongOffset,
                                 const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}

static size_t
ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
                                               void* dst, size_t maxDstSize,
                                               const void* seqStart, size_t seqSize, int nbSeq,
                                               const ZSTD_longOffset_e isLongOffset,
                                               const int frame)
{
    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT

FORCE_INLINE_TEMPLATE size_t
ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
                   const BYTE* const prefixStart, const BYTE* const dictEnd)
{
    prefetchPos += sequence.litLength;
    {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
        const BYTE* const match = matchBase + prefetchPos - sequence.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
                                                                               * No consequence though : memory address is only used for prefetching, not for dereferencing */
        PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
    }
    return prefetchPos + sequence.matchLength;
}
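
/* Illustration (editor's note, hypothetical numbers) : suppose prefetchPos is
 * 100 on entry and the freshly decoded sequence carries litLength=5,
 * offset=30, matchLength=8. The literals advance the position to 105; since
 * offset (30) <= 105 the match lies in the prefix, so PREFETCH_L1 is issued
 * for prefixStart + 105 - 30 = prefixStart + 75, and the function returns
 * 113, the position right after the match. Had offset exceeded 105, the
 * match would resolve into the dictionary, at dictEnd - (offset - 105). */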

/* This decoding function employs prefetching
 * to reduce latency impact of cache misses.
 * It's generally employed when a block contains a significant portion of long-distance matches
 * or when coupled with a "cold" dictionary */
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body(
                               ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                               const void* seqStart, size_t seqSize, int nbSeq,
                               const ZSTD_longOffset_e isLongOffset,
                               const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* litBufferEnd = dctx->litBufferEnd;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
#define STORED_SEQS 8
#define STORED_SEQS_MASK (STORED_SEQS-1)
#define ADVANCED_SEQS STORED_SEQS
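        /* Editor's note (illustration) : the ring of STORED_SEQS decoded
         * sequences lets each match be prefetched several iterations before
         * it is copied. With nbSeq = 20 for instance : sequences 0..7 are
         * decoded and prefetched up front; the main loop then executes
         * sequence (seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK while decoding
         * and prefetching sequence seqNb, i.e. it executes 0..11 while
         * decoding 8..19; the "finish queue" loop drains the remaining
         * sequences 12..19. */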
        seq_t sequences[STORED_SEQS];
        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
        seqState_t seqState;
        int seqNb;
        size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */

        dctx->fseEntropy = 1;
        { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        assert(dst != NULL);
        assert(iend >= ip);
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

        /* prepare in advance */
        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
            sequences[seqNb] = sequence;
        }
        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");

        /* decompress without stomping litBuffer */
        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t oneSeqSize;

            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
            {
                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
                sequences[seqNb & STORED_SEQS_MASK] = sequence;
                op += oneSeqSize;
            }
            else
            {
                /* lit buffer is either wholly contained in first or second split, or not split at all */
                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
                sequences[seqNb & STORED_SEQS_MASK] = sequence;
                op += oneSeqSize;
            }
        }
        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

        /* finish queue */
        seqNb -= seqAdvance;
        for ( ; seqNb<nbSeq ; seqNb++) {
            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
            {
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequence->litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                    assert(!ZSTD_isError(oneSeqSize));
                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                    op += oneSeqSize;
                }
            }
            else
            {
                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                    ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                op += oneSeqSize;
            }
        }

        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */
    {
        size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
        litPtr = dctx->litExtraBuffer;
        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
    }
    {   size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
                                     void* dst, size_t maxDstSize,
                                     const void* seqStart, size_t seqSize, int nbSeq,
                                     const ZSTD_longOffset_e isLongOffset,
                                     const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


#if DYNAMIC_BMI2

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static BMI2_TARGET_ATTRIBUTE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                              void* dst, size_t maxDstSize,
                              const void* seqStart, size_t seqSize, int nbSeq,
                              const ZSTD_longOffset_e isLongOffset,
                              const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
static BMI2_TARGET_ATTRIBUTE size_t
DONT_VECTORIZE
ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
                                            void* dst, size_t maxDstSize,
                                            const void* seqStart, size_t seqSize, int nbSeq,
                                            const ZSTD_longOffset_e isLongOffset,
                                            const int frame)
{
    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
static BMI2_TARGET_ATTRIBUTE size_t
ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                                  const void* seqStart, size_t seqSize, int nbSeq,
                                  const ZSTD_longOffset_e isLongOffset,
                                  const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#endif /* DYNAMIC_BMI2 */

typedef size_t (*ZSTD_decompressSequences_t)(
                            ZSTD_DCtx* dctx,
                            void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
                            const ZSTD_longOffset_e isLongOffset,
                            const int frame);

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static size_t
ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                         const ZSTD_longOffset_e isLongOffset,
                         const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequences");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
static size_t
ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                                       const void* seqStart, size_t seqSize, int nbSeq,
                                       const ZSTD_longOffset_e isLongOffset,
                                       const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */


#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/* ZSTD_decompressSequencesLong() :
 * decompression function triggered when a minimum share of offsets is considered "long",
 * aka out of cache.
 * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
 * This function will try to mitigate main memory latency through the use of prefetching */
static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
                             void* dst, size_t maxDstSize,
                             const void* seqStart, size_t seqSize, int nbSeq,
                             const ZSTD_longOffset_e isLongOffset,
                             const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
/* ZSTD_getLongOffsetsShare() :
 * condition : offTable must be valid
 * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
 *           compared to maximum possible of (1<<OffFSELog) */
static unsigned
ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
{
    const void* ptr = offTable;
    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
    const ZSTD_seqSymbol* table = offTable + 1;
    U32 const max = 1 << tableLog;
    U32 u, total = 0;
    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);

    assert(max <= (1 << OffFSELog));  /* max not too large */
    for (u=0; u<max; u++) {
        if (table[u].nbAdditionalBits > 22) total += 1;
    }

    assert(tableLog <= OffFSELog);
    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */

    return total;
}
#endif
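
/* Worked example (editor's note, hypothetical values) : with tableLog = 6 the
 * table holds max = 64 states; if 3 of them carry nbAdditionalBits > 22, then
 * total = 3 << (OffFSELog - 6) = 12 after scaling to OffFSELog = 8, i.e. a
 * 12/256 ~= 4.7% long-offset share. Measured against the minShare heuristics
 * used below (7 on 64-bit ~= 2.73%, 20 on 32-bit ~= 7.81%), such a block would
 * select the prefetching decoder on 64-bit targets but not on 32-bit ones. */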
size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                      void* dst, size_t dstCapacity,
                const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
{   /* blockType == blockCompressed */
    const BYTE* ip = (const BYTE*)src;
    /* isLongOffset must be true if there are long offsets.
     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
     * We don't expect that to be the case in 64-bit mode.
     * In block mode, window size is not known, so we have to be conservative.
     * (note: but it could be evaluated from current-lowLimit)
     */
    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

    /* Decode literals section */
    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
        if (ZSTD_isError(litCSize)) return litCSize;
        ip += litCSize;
        srcSize -= litCSize;
    }

    /* Build Decoding Tables */
    {
        /* These macros control at build-time which decompressor implementation
         * we use. If neither is defined, we do some inspection and dispatch at
         * runtime.
         */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        int usePrefetchDecoder = dctx->ddictIsCold;
#endif
        int nbSeq;
        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
        if (ZSTD_isError(seqHSize)) return seqHSize;
        ip += seqHSize;
        srcSize -= seqHSize;

        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if ( !usePrefetchDecoder
          && (!frame || (dctx->fParams.windowSize > (1<<24)))
          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
            U32 const minShare = MEM_64bits() ? 7 : 20;  /* heuristic values, correspond to 2.73% and 7.81% */
            usePrefetchDecoder = (shareLongOffsets >= minShare);
        }
#endif

        dctx->ddictIsCold = 0;

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if (usePrefetchDecoder)
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
        /* else */
        if (dctx->litBufferLocation == ZSTD_split)
            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
        else
            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif
    }
}
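
/* Dispatch summary (editor's note) : when neither FORCE macro is defined, a
 * block goes to ZSTD_decompressSequencesLong() if the dictionary is cold, or
 * if the window may be large, the block carries more than ADVANCED_SEQS
 * sequences, and enough offsets exceed 1<<23 for prefetching to pay off;
 * otherwise it goes to ZSTD_decompressSequences() or its split-literal-buffer
 * variant, depending on where ZSTD_decodeLiteralsBlock() placed the literals. */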

void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
{
    if (dst != dctx->previousDstEnd && dstSize > 0) {  /* not contiguous */
        dctx->dictEnd = dctx->previousDstEnd;
        dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
        dctx->prefixStart = dst;
        dctx->previousDstEnd = dst;
    }
}
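
/* Illustration (editor's note, hypothetical addresses) : if the previous block
 * wrote [0x1000, 0x1400) and the next block targets a non-contiguous dst =
 * 0x8000, the old segment becomes the extDict ending at dictEnd = 0x1400, and
 * virtualStart = 0x8000 - 0x400 = 0x7C00. Offsets reaching back past
 * prefixStart = 0x8000 are then remapped into the old segment as if the two
 * ranges were contiguous. */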

size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
                            void* dst, size_t dstCapacity,
                      const void* src, size_t srcSize)
{
    size_t dSize;
    ZSTD_checkContinuity(dctx, dst, dstCapacity);
    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
    dctx->previousDstEnd = (char*)dst + dSize;
    return dSize;
}
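
/* Usage sketch (editor's illustration, not part of the library) : the
 * block-level API requires an initialized context; per zstd.h,
 * ZSTD_decompressBegin() (or a _usingDict() variant) must be called before
 * decoding the first block.
 *
 *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
 *     size_t const initErr = ZSTD_decompressBegin(dctx);
 *     size_t const dSize = ZSTD_isError(initErr) ? initErr
 *         : ZSTD_decompressBlock(dctx, dst, dstCapacity, src, srcSize);
 *     if (ZSTD_isError(dSize)) handle_error(dSize);
 *     ZSTD_freeDCtx(dctx);
 */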