GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/zstd/lib/decompress/zstd_decompress_block.c
/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ blocks */

/*-*******************************************************
*  Dependencies
*********************************************************/
#include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
#include "../common/compiler.h"    /* prefetch */
#include "../common/cpu.h"         /* bmi2 */
#include "../common/mem.h"         /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
#include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"

/*_*******************************************************
*  Macros
**********************************************************/

/* These two optional macros force the use of one or the other of the two
 * ZSTD_decompressSequences implementations. You can't force both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif


/*_*******************************************************
*  Memory operations
**********************************************************/
static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }


/*-*************************************************************
*   Block decoding
***************************************************************/

/*! ZSTD_getcBlockSize() :
 *  Provides the size of compressed block from block header `src` */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                          blockProperties_t* bpPtr)
{
    RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");

    {   U32 const cBlockHeader = MEM_readLE24(src);
        U32 const cSize = cBlockHeader >> 3;
        bpPtr->lastBlock = cBlockHeader & 1;
        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
        bpPtr->origSize = cSize;   /* only useful for RLE */
        if (bpPtr->blockType == bt_rle) return 1;
        RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
        return cSize;
    }
}
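
/* Worked example (illustrative note, not part of the upstream file):
 * header bytes {0x0D, 0x01, 0x00} read little-endian give cBlockHeader = 0x00010D,
 * so lastBlock = 1, blockType = (cBlockHeader >> 1) & 3 = 2 (bt_compressed),
 * and cSize = cBlockHeader >> 3 = 33 compressed bytes. */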

/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */
static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
                                        const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
{
    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
    {
        /* room for litbuffer to fit without read faulting */
        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_in_dst;
    }
    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
    {
        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
        if (splitImmediately) {
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
        }
        else {
            /* initially this will be stored entirely in dst during huffman decoding; it will be partially shifted to litExtraBuffer afterwards */
            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
        }
        dctx->litBufferLocation = ZSTD_split;
    }
    else
    {
        /* fits entirely within litExtraBuffer, so no split is necessary */
        dctx->litBuffer = dctx->litExtraBuffer;
        dctx->litBufferEnd = dctx->litBuffer + litSize;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    }
}
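
/* Illustrative note (not part of the upstream file), assuming
 * ZSTD_BLOCKSIZE_MAX == 128 KB: a non-streaming call whose dstCapacity
 * comfortably exceeds 128 KB + litSize + 2 * WILDCOPY_OVERLENGTH keeps the
 * literals inside dst, past the block's write region (ZSTD_in_dst); a litSize
 * no larger than ZSTD_LITBUFFEREXTRASIZE lands wholly in litExtraBuffer
 * (ZSTD_not_in_dst); anything in between is split (ZSTD_split). */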

/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,
                          void* dst, size_t dstCapacity, const streaming_operation streaming);
/*! ZSTD_decodeLiteralsBlock() :
 * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
 * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
 * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being
 * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.
 *
 * @return : nb of bytes read from src (< srcSize)
 *  note : symbol not declared but exposed for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                          void* dst, size_t dstCapacity, const streaming_operation streaming)
{
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
    RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");

    {   const BYTE* const istart = (const BYTE*) src;
        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);

        switch(litEncType)
        {
        case set_repeat:
            DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
            RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
            ZSTD_FALLTHROUGH;

        case set_compressed:
            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
            {   size_t lhSize, litSize, litCSize;
                U32 singleStream=0;
                U32 const lhlCode = (istart[0] >> 2) & 3;
                U32 const lhc = MEM_readLE32(istart);
                size_t hufSuccess;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 1: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    /* 2 - 2 - 10 - 10 */
                    singleStream = !lhlCode;
                    lhSize = 3;
                    litSize  = (lhc >> 4) & 0x3FF;
                    litCSize = (lhc >> 14) & 0x3FF;
                    break;
                case 2:
                    /* 2 - 2 - 14 - 14 */
                    lhSize = 4;
                    litSize  = (lhc >> 4) & 0x3FFF;
                    litCSize = lhc >> 18;
                    break;
                case 3:
                    /* 2 - 2 - 18 - 18 */
                    lhSize = 5;
                    litSize  = (lhc >> 4) & 0x3FFFF;
                    litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);

                /* prefetch huffman table if cold */
                if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
                    PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
                }

                if (litEncType==set_repeat) {
                    if (singleStream) {
                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
                    } else {
                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
                            dctx->litBuffer, litSize, istart+lhSize, litCSize,
                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
                    }
                } else {
                    if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
                        hufSuccess = HUF_decompress1X_DCtx_wksp(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace));
#else
                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
#endif
                    } else {
                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
                            dctx->entropy.hufTable, dctx->litBuffer, litSize,
                            istart+lhSize, litCSize, dctx->workspace,
                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
                    }
                }
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
                }

                RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");

                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                dctx->litEntropy = 1;
                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
                return litCSize + lhSize;
            }

        case set_basic:
            {   size_t litSize, lhSize;
                U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    break;
                }

                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
                    RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
                    if (dctx->litBufferLocation == ZSTD_split)
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);
                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                    }
                    else
                    {
                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);
                    }
                    dctx->litPtr = dctx->litBuffer;
                    dctx->litSize = litSize;
                    return lhSize+litSize;
                }
                /* direct reference into compressed stream */
                dctx->litPtr = istart+lhSize;
                dctx->litSize = litSize;
                dctx->litBufferEnd = dctx->litPtr + litSize;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                return lhSize+litSize;
            }

        case set_rle:
            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                size_t litSize, lhSize;
                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
                switch(lhlCode)
                {
                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
                    lhSize = 1;
                    litSize = istart[0] >> 3;
                    break;
                case 1:
                    lhSize = 2;
                    litSize = MEM_readLE16(istart) >> 4;
                    break;
                case 3:
                    lhSize = 3;
                    litSize = MEM_readLE24(istart) >> 4;
                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                    break;
                }
                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                if (dctx->litBufferLocation == ZSTD_split)
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);
                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);
                }
                else
                {
                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);
                }
                dctx->litPtr = dctx->litBuffer;
                dctx->litSize = litSize;
                return lhSize+1;
            }
        default:
            RETURN_ERROR(corruption_detected, "impossible");
        }
    }
}
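
/* Worked example (illustrative note, not part of the upstream file):
 * for a compressed-literals header with lhc == 0x1902A2, the low bits give
 * litEncType == 2 (set_compressed) and lhlCode == 0, so lhSize = 3,
 * singleStream = 1, litSize = (lhc >> 4) & 0x3FF == 42, and
 * litCSize = (lhc >> 14) & 0x3FF == 100. */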

/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - printout the content of tables
 * - prettify output, reported below, test with fuzzer to ensure it's correct */

/* Default FSE distribution table for Literal Lengths */
static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     {  1,  1,  1, LL_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
     /* nextState, nbAddBits, nbBits, baseVal */
     {  0,  0,  4,    0},  { 16,  0,  4,    0},
     { 32,  0,  5,    1},  {  0,  0,  5,    3},
     {  0,  0,  5,    4},  {  0,  0,  5,    6},
     {  0,  0,  5,    7},  {  0,  0,  5,    9},
     {  0,  0,  5,   10},  {  0,  0,  5,   12},
     {  0,  0,  6,   14},  {  0,  1,  5,   16},
     {  0,  1,  5,   20},  {  0,  1,  5,   22},
     {  0,  2,  5,   28},  {  0,  3,  5,   32},
     {  0,  4,  5,   48},  { 32,  6,  5,   64},
     {  0,  7,  5,  128},  {  0,  8,  6,  256},
     {  0, 10,  6, 1024},  {  0, 12,  6, 4096},
     { 32,  0,  4,    0},  {  0,  0,  4,    1},
     {  0,  0,  5,    2},  { 32,  0,  5,    4},
     {  0,  0,  5,    5},  { 32,  0,  5,    7},
     {  0,  0,  5,    8},  { 32,  0,  5,   10},
     {  0,  0,  5,   11},  {  0,  0,  6,   13},
     { 32,  1,  5,   16},  {  0,  1,  5,   18},
     { 32,  1,  5,   22},  {  0,  2,  5,   24},
     { 32,  3,  5,   32},  {  0,  3,  5,   40},
     {  0,  6,  4,   64},  { 16,  6,  4,   64},
     { 32,  7,  5,  128},  {  0,  9,  6,  512},
     {  0, 11,  6, 2048},  { 48,  0,  4,    0},
     { 16,  0,  4,    1},  { 32,  0,  5,    2},
     { 32,  0,  5,    3},  { 32,  0,  5,    5},
     { 32,  0,  5,    6},  { 32,  0,  5,    8},
     { 32,  0,  5,    9},  { 32,  0,  5,   11},
     { 32,  0,  5,   12},  {  0,  0,  6,   15},
     { 32,  1,  5,   18},  { 32,  1,  5,   20},
     { 32,  2,  5,   24},  { 32,  2,  5,   28},
     { 32,  3,  5,   40},  { 32,  4,  5,   48},
     {  0, 16,  6,65536},  {  0, 15,  6,32768},
     {  0, 14,  6,16384},  {  0, 13,  6, 8192},
};   /* LL_defaultDTable */

/* Default FSE distribution table for Offset Codes */
static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, OF_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  5,    0},     {  0,  6,  4,   61},
    {  0,  9,  5,  509},     {  0, 15,  5,32765},
    {  0, 21,  5,2097149},   {  0,  3,  5,    5},
    {  0,  7,  4,  125},     {  0, 12,  5, 4093},
    {  0, 18,  5,262141},    {  0, 23,  5,8388605},
    {  0,  5,  5,   29},     {  0,  8,  4,  253},
    {  0, 14,  5,16381},     {  0, 20,  5,1048573},
    {  0,  2,  5,    1},     { 16,  7,  4,  125},
    {  0, 11,  5, 2045},     {  0, 17,  5,131069},
    {  0, 22,  5,4194301},   {  0,  4,  5,   13},
    { 16,  8,  4,  253},     {  0, 13,  5, 8189},
    {  0, 19,  5,524285},    {  0,  1,  5,    1},
    { 16,  6,  4,   61},     {  0, 10,  5, 1021},
    {  0, 16,  5,65533},     {  0, 28,  5,268435453},
    {  0, 27,  5,134217725}, {  0, 26,  5,67108861},
    {  0, 25,  5,33554429},  {  0, 24,  5,16777213},
};   /* OF_defaultDTable */


/* Default FSE distribution table for Match Lengths */
static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
    {  1,  1,  1, ML_DEFAULTNORMLOG},  /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {  0,  0,  6,    3},  {  0,  0,  4,    4},
    { 32,  0,  5,    5},  {  0,  0,  5,    6},
    {  0,  0,  5,    8},  {  0,  0,  5,    9},
    {  0,  0,  5,   11},  {  0,  0,  6,   13},
    {  0,  0,  6,   16},  {  0,  0,  6,   19},
    {  0,  0,  6,   22},  {  0,  0,  6,   25},
    {  0,  0,  6,   28},  {  0,  0,  6,   31},
    {  0,  0,  6,   34},  {  0,  1,  6,   37},
    {  0,  1,  6,   41},  {  0,  2,  6,   47},
    {  0,  3,  6,   59},  {  0,  4,  6,   83},
    {  0,  7,  6,  131},  {  0,  9,  6,  515},
    { 16,  0,  4,    4},  {  0,  0,  4,    5},
    { 32,  0,  5,    6},  {  0,  0,  5,    7},
    { 32,  0,  5,    9},  {  0,  0,  5,   10},
    {  0,  0,  6,   12},  {  0,  0,  6,   15},
    {  0,  0,  6,   18},  {  0,  0,  6,   21},
    {  0,  0,  6,   24},  {  0,  0,  6,   27},
    {  0,  0,  6,   30},  {  0,  0,  6,   33},
    {  0,  1,  6,   35},  {  0,  1,  6,   39},
    {  0,  2,  6,   43},  {  0,  3,  6,   51},
    {  0,  4,  6,   67},  {  0,  5,  6,   99},
    {  0,  8,  6,  259},  { 32,  0,  4,    4},
    { 48,  0,  4,    4},  { 16,  0,  4,    5},
    { 32,  0,  5,    7},  { 32,  0,  5,    8},
    { 32,  0,  5,   10},  { 32,  0,  5,   11},
    {  0,  0,  6,   14},  {  0,  0,  6,   17},
    {  0,  0,  6,   20},  {  0,  0,  6,   23},
    {  0,  0,  6,   26},  {  0,  0,  6,   29},
    {  0,  0,  6,   32},  {  0, 16,  6,65539},
    {  0, 15,  6,32771},  {  0, 14,  6,16387},
    {  0, 13,  6, 8195},  {  0, 12,  6, 4099},
    {  0, 11,  6, 2051},  {  0, 10,  6, 1027},
};   /* ML_defaultDTable */


static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)
{
    void* ptr = dt;
    ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
    ZSTD_seqSymbol* const cell = dt + 1;

    DTableH->tableLog = 0;
    DTableH->fastMode = 0;

    cell->nbBits = 0;
    cell->nextState = 0;
    assert(nbAddBits < 255);
    cell->nbAdditionalBits = nbAddBits;
    cell->baseValue = baseValue;
}


/* ZSTD_buildFSETable() :
 * generate FSE decoding table for one symbol (ll, ml or off)
 * cannot fail if input is valid =>
 * all inputs are presumed validated at this stage */
FORCE_INLINE_TEMPLATE
void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_seqSymbol* const tableDecode = dt+1;
    U32 const maxSV1 = maxSymbolValue + 1;
    U32 const tableSize = 1 << tableLog;

    U16* symbolNext = (U16*)wksp;
    BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
    U32 highThreshold = tableSize - 1;


    /* Sanity Checks */
    assert(maxSymbolValue <= MaxSeq);
    assert(tableLog <= MaxFSELog);
    assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
    (void)wkspSize;
    /* Init, lay down lowprob symbols */
    {   ZSTD_seqSymbol_header DTableH;
        DTableH.tableLog = tableLog;
        DTableH.fastMode = 1;
        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
            U32 s;
            for (s=0; s<maxSV1; s++) {
                if (normalizedCounter[s]==-1) {
                    tableDecode[highThreshold--].baseValue = s;
                    symbolNext[s] = 1;
                } else {
                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
                    assert(normalizedCounter[s]>=0);
                    symbolNext[s] = (U16)normalizedCounter[s];
        }   }   }
        ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
    }

    /* Spread symbols */
    assert(tableSize <= 512);
    /* Specialized symbol spreading for the case when there are
     * no low probability (-1 count) symbols. When compressing
     * small blocks we avoid low probability symbols to hit this
     * case, since header decoding speed matters more.
     */
    if (highThreshold == tableSize - 1) {
        size_t const tableMask = tableSize-1;
        size_t const step = FSE_TABLESTEP(tableSize);
        /* First lay down the symbols in order.
         * We use a uint64_t to lay down 8 bytes at a time. This reduces branch
         * misses since small blocks generally have small table logs, so nearly
         * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
         * our buffer to handle the over-write.
         */
        {
            U64 const add = 0x0101010101010101ull;
            size_t pos = 0;
            U64 sv = 0;
            U32 s;
            for (s=0; s<maxSV1; ++s, sv += add) {
                int i;
                int const n = normalizedCounter[s];
                MEM_write64(spread + pos, sv);
                for (i = 8; i < n; i += 8) {
                    MEM_write64(spread + pos + i, sv);
                }
                pos += n;
            }
        }
        /* Now we spread those positions across the table.
         * The benefit of doing it in two stages is that we avoid the
         * variable size inner loop, which caused lots of branch misses.
         * Now we can run through all the positions without any branch misses.
         * We unroll the loop twice, since that is what empirically worked best.
         */
        {
            size_t position = 0;
            size_t s;
            size_t const unroll = 2;
            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
            for (s = 0; s < (size_t)tableSize; s += unroll) {
                size_t u;
                for (u = 0; u < unroll; ++u) {
                    size_t const uPosition = (position + (u * step)) & tableMask;
                    tableDecode[uPosition].baseValue = spread[s + u];
                }
                position = (position + (unroll * step)) & tableMask;
            }
            assert(position == 0);
        }
    } else {
        U32 const tableMask = tableSize-1;
        U32 const step = FSE_TABLESTEP(tableSize);
        U32 s, position = 0;
        for (s=0; s<maxSV1; s++) {
            int i;
            int const n = normalizedCounter[s];
            for (i=0; i<n; i++) {
                tableDecode[position].baseValue = s;
                position = (position + step) & tableMask;
                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
        }   }
        assert(position == 0);  /* position must reach all cells once, otherwise normalizedCounter is incorrect */
    }

    /* Build Decoding table */
    {
        U32 u;
        for (u=0; u<tableSize; u++) {
            U32 const symbol = tableDecode[u].baseValue;
            U32 const nextState = symbolNext[symbol]++;
            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
            tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
            assert(nbAdditionalBits[symbol] < 255);
            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
            tableDecode[u].baseValue = baseValue[symbol];
        }
    }
}

/* Avoids the FORCE_INLINE of the _body() function. */
static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}

#if DYNAMIC_BMI2
BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize)
{
    ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#endif

void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
            const short* normalizedCounter, unsigned maxSymbolValue,
            const U32* baseValue, const U8* nbAdditionalBits,
            unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
    if (bmi2) {
        ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
                baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
        return;
    }
#endif
    (void)bmi2;
    ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
            baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
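
/* Illustrative usage sketch (not part of the upstream file): the default
 * tables above can be regenerated with this function, assuming the default
 * distributions LL_defaultNorm / LL_DEFAULTNORMLOG from zstd_internal.h and a
 * hypothetical workspace sized per ZSTD_BUILD_FSE_TABLE_WKSP_SIZE:
 *
 *     ZSTD_seqSymbol dt[SEQSYMBOL_TABLE_SIZE(LL_DEFAULTNORMLOG)];
 *     U32 wksp[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE / sizeof(U32)];
 *     ZSTD_buildFSETable(dt, LL_defaultNorm, MaxLL, LL_base, LL_bits,
 *                        LL_DEFAULTNORMLOG, wksp, sizeof(wksp), 0);
 *
 * dt should then match LL_defaultDTable above. */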


/*! ZSTD_buildSeqTable() :
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
                                 symbolEncodingType_e type, unsigned max, U32 maxLog,
                                 const void* src, size_t srcSize,
                                 const U32* baseValue, const U8* nbAdditionalBits,
                                 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
                                 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
                                 int bmi2)
{
    switch(type)
    {
    case set_rle :
        RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
        RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
        {   U32 const symbol = *(const BYTE*)src;
            U32 const baseline = baseValue[symbol];
            U8 const nbBits = nbAdditionalBits[symbol];
            ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
        }
        *DTablePtr = DTableSpace;
        return 1;
    case set_basic :
        *DTablePtr = defaultTable;
        return 0;
    case set_repeat:
        RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
        /* prefetch FSE table if used */
        if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
            const void* const pStart = *DTablePtr;
            size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
            PREFETCH_AREA(pStart, pSize);
        }
        return 0;
    case set_compressed :
        {   unsigned tableLog;
            S16 norm[MaxSeq+1];
            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
            RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
            RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
            ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
            *DTablePtr = DTableSpace;
            return headerSize;
        }
    default :
        assert(0);
        RETURN_ERROR(GENERIC, "impossible");
    }
}

size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                             const void* src, size_t srcSize)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;
    int nbSeq;
    DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

    /* check */
    RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");

    /* SeqHead */
    nbSeq = *ip++;
    if (!nbSeq) {
        *nbSeqPtr=0;
        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
        return 1;
    }
    if (nbSeq > 0x7F) {
        if (nbSeq == 0xFF) {
            RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
            nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
            ip+=2;
        } else {
            RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
            nbSeq = ((nbSeq-0x80)<<8) + *ip++;
        }
    }
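
    /* Worked example (illustrative note, not part of the upstream file):
     * a first byte of 0x20 encodes nbSeq = 32 directly; bytes {0x81, 0x05}
     * encode nbSeq = ((0x81-0x80) << 8) + 0x05 = 261; bytes {0xFF, 0x34, 0x12}
     * encode nbSeq = 0x1234 + LONGNBSEQ. */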
    *nbSeqPtr = nbSeq;

    /* FSE table descriptors */
    RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
        ip++;

        /* Build DTables */
        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
                                                      LLtype, MaxLL, LLFSELog,
                                                      ip, iend-ip,
                                                      LL_base, LL_bits,
                                                      LL_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += llhSize;
        }

        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
                                                      OFtype, MaxOff, OffFSELog,
                                                      ip, iend-ip,
                                                      OF_base, OF_bits,
                                                      OF_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += ofhSize;
        }

        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
                                                      MLtype, MaxML, MLFSELog,
                                                      ip, iend-ip,
                                                      ML_base, ML_bits,
                                                      ML_defaultDTable, dctx->fseEntropy,
                                                      dctx->ddictIsCold, nbSeq,
                                                      dctx->workspace, sizeof(dctx->workspace),
                                                      ZSTD_DCtx_get_bmi2(dctx));
            RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
            ip += mlhSize;
        }
    }

    return ip-istart;
}


typedef struct {
    size_t litLength;
    size_t matchLength;
    size_t offset;
} seq_t;

typedef struct {
    size_t state;
    const ZSTD_seqSymbol* table;
} ZSTD_fseState;

typedef struct {
    BIT_DStream_t DStream;
    ZSTD_fseState stateLL;
    ZSTD_fseState stateOffb;
    ZSTD_fseState stateML;
    size_t prevOffset[ZSTD_REP_NUM];
} seqState_t;

/*! ZSTD_overlapCopy8() :
 *  Copies 8 bytes from ip to op and updates op and ip where ip <= op.
 *  If the offset is < 8 then the offset is spread to at least 8 bytes.
 *
 *  Precondition: *ip <= *op
 *  Postcondition: *op - *ip >= 8
 */
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
    assert(*ip <= *op);
    if (offset < 8) {
        /* close range match, overlap */
        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
        int const sub2 = dec64table[offset];
        (*op)[0] = (*ip)[0];
        (*op)[1] = (*ip)[1];
        (*op)[2] = (*ip)[2];
        (*op)[3] = (*ip)[3];
        *ip += dec32table[offset];
        ZSTD_copy4(*op+4, *ip);
        *ip -= sub2;
    } else {
        ZSTD_copy8(*op, *ip);
    }
    *ip += 8;
    *op += 8;
    assert(*op - *ip >= 8);
}
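
/* Worked example (illustrative note, not part of the upstream file): with
 * offset == 2, the first 4 bytes are copied one at a time (a period-2 repeat),
 * the source then advances by dec32table[2] == 2 so ZSTD_copy4() can fill
 * bytes 4..7 from freshly written output, and the source is finally pulled
 * back by dec64table[2] == 8, leaving *op - *ip == 8 so later copies can
 * proceed 8 bytes at a time. */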

/*! ZSTD_safecopy() :
 *  Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
 *  and write up to 16 bytes past oend_w (op >= oend_w is allowed).
 *  This function is only called in the uncommon case where the sequence is near the end of the block. It
 *  should be fast for a single long sequence, but can be slow for several short sequences.
 *
 *  @param ovtype controls the overlap detection
 *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
 *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
 *           The src buffer must be before the dst buffer.
 */
static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
           (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));

    if (length < 8) {
        /* Handle short lengths. */
        while (op < oend) *op++ = *ip++;
        return;
    }
    if (ovtype == ZSTD_overlap_src_before_dst) {
        /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
        assert(length >= 8);
        ZSTD_overlapCopy8(&op, &ip, diff);
        length -= 8;
        assert(op - ip >= 8);
        assert(op <= oend);
    }

    if (oend <= oend_w) {
        /* No risk of overwrite. */
        ZSTD_wildcopy(op, ip, length, ovtype);
        return;
    }
    if (op <= oend_w) {
        /* Wildcopy until we get close to the end. */
        assert(oend > oend_w);
        ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
        ip += oend_w - op;
        op += oend_w - op;
    }
    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_safecopyDstBeforeSrc():
 * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
 * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
    ptrdiff_t const diff = op - ip;
    BYTE* const oend = op + length;

    if (length < 8 || diff > -8) {
        /* Handle short lengths, close overlaps, and dst not before src. */
        while (op < oend) *op++ = *ip++;
        return;
    }

    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {
        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);
        ip += oend - WILDCOPY_OVERLENGTH - op;
        op += oend - WILDCOPY_OVERLENGTH - op;
    }

    /* Handle the leftovers. */
    while (op < oend) *op++ = *ip++;
}

/* ZSTD_execSequenceEnd():
 * This version handles cases that are near the end of the output buffer. It requires
 * more careful checks to make sure there is no overflow. By separating out these hard
 * and unlikely cases, we can speed up the common cases.
 *
 * NOTE: This function needs to be fast for a single long sequence, but doesn't need
 * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
 */
FORCE_NOINLINE
size_t ZSTD_execSequenceEnd(BYTE* op,
                            BYTE* const oend, seq_t sequence,
                            const BYTE** litPtr, const BYTE* const litLimit,
                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;

    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

/* ZSTD_execSequenceEndSplitLitBuffer():
 * This version is intended to be used while the litBuffer is still split. It is kept separate to avoid performance impact on the good case.
 */
FORCE_NOINLINE
size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
                            BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
                            const BYTE** litPtr, const BYTE* const litLimit,
                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;


    /* bounds checks : careful of address space overflow in 32-bit mode */
    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
    assert(op < op + sequenceLength);
    assert(oLitEnd < op + sequenceLength);

    /* copy literals */
    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
    op = oLitEnd;
    *litPtr = iLitEnd;

    /* copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix */
        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
        match = dictEnd - (prefixStart - match);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
    return sequenceLength;
}

HINT_INLINE
size_t ZSTD_execSequence(BYTE* op,
                         BYTE* const oend, seq_t sequence,
                         const BYTE** litPtr, const BYTE* const litLimit,
                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;   /* risk : address space underflow on oend=NULL */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLENGTH of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
        }
    }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}

HINT_INLINE
size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
                                       BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
                                       const BYTE** litPtr, const BYTE* const litLimit,
                                       const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
    BYTE* const oLitEnd = op + sequence.litLength;
    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
    const BYTE* match = oLitEnd - sequence.offset;

    assert(op != NULL /* Precondition */);
    assert(oend_w < oend /* No underflow */);
    /* Handle edge cases in a slow path:
     *   - Read beyond end of literals
     *   - Match end is within WILDCOPY_OVERLENGTH of oend
     *   - 32-bit mode and the match length overflows
     */
    if (UNLIKELY(
            iLitEnd > litLimit ||
            oMatchEnd > oend_w ||
            (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
    assert(op <= oLitEnd /* No overflow */);
    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
    assert(oMatchEnd <= oend /* No underflow */);
    assert(iLitEnd <= litLimit /* Literal length is in bounds */);
    assert(oLitEnd <= oend_w /* Can wildcopy literals */);
    assert(oMatchEnd <= oend_w /* Can wildcopy matches */);

    /* Copy Literals:
     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
     * We likely don't need the full 32-byte wildcopy.
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
    }
    op = oLitEnd;
    *litPtr = iLitEnd;   /* update for next sequence */

    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            ZSTD_memmove(oLitEnd, match, sequence.matchLength);
            return sequenceLength;
        }
        /* span extDict & currentPrefixSegment */
        {   size_t const length1 = dictEnd - match;
            ZSTD_memmove(oLitEnd, match, length1);
            op = oLitEnd + length1;
            sequence.matchLength -= length1;
            match = prefixStart;
    }   }
    /* Match within prefix of 1 or more bytes */
    assert(op <= oMatchEnd);
    assert(oMatchEnd <= oend_w);
    assert(match >= prefixStart);
    assert(sequence.matchLength >= 1);

    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
         */
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
        return sequenceLength;
    }
    assert(sequence.offset < WILDCOPY_VECLEN);

    /* Copy 8 bytes and spread the offset to be >= 8. */
    ZSTD_overlapCopy8(&op, &match, sequence.offset);

    /* If the match length is > 8 bytes, then continue with the wildcopy. */
    if (sequence.matchLength > 8) {
        assert(op < oMatchEnd);
        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
    }
    return sequenceLength;
}


static void
ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
{
    const void* ptr = dt;
    const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
    DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
                (U32)DStatePtr->state, DTableH->tableLog);
    BIT_reloadDStream(bitD);
    DStatePtr->table = dt + 1;
}

FORCE_INLINE_TEMPLATE void
ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)
{
    size_t const lowBits = BIT_readBits(bitD, nbBits);
    DStatePtr->state = nextState + lowBits;
}

/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 * bits before reloading. This value is the maximum number of bits we read
 * after reloading when we are decoding long offsets.
 */
#define LONG_OFFSETS_MAX_EXTRA_BITS_32                       \
    (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32       \
        ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32  \
        : 0)
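
/* Example (illustrative note, not part of the upstream file): with
 * ZSTD_WINDOWLOG_MAX_32 == 30 and STREAM_ACCUMULATOR_MIN_32 == 25,
 * LONG_OFFSETS_MAX_EXTRA_BITS_32 evaluates to 5, which matches the static
 * assert in ZSTD_decodeSequence() below. */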

typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;

FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
    seq_t seq;
    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
    seq.matchLength = mlDInfo->baseValue;
    seq.litLength = llDInfo->baseValue;
    {   U32 const ofBase = ofDInfo->baseValue;
        BYTE const llBits = llDInfo->nbAdditionalBits;
        BYTE const mlBits = mlDInfo->nbAdditionalBits;
        BYTE const ofBits = ofDInfo->nbAdditionalBits;
        BYTE const totalBits = llBits+mlBits+ofBits;

        U16 const llNext = llDInfo->nextState;
        U16 const mlNext = mlDInfo->nextState;
        U16 const ofNext = ofDInfo->nextState;
        U32 const llnbBits = llDInfo->nbBits;
        U32 const mlnbBits = mlDInfo->nbBits;
        U32 const ofnbBits = ofDInfo->nbBits;
        /*
         * As gcc has better branch and block analyzers, sometimes it is only
         * valuable to mark likeliness for clang; doing so gives around 3-4% of
         * performance.
         */

        /* sequence */
        {   size_t offset;
#if defined(__clang__)
            if (LIKELY(ofBits > 1)) {
#else
            if (ofBits > 1) {
#endif
                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
                assert(ofBits <= MaxOff);
                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                    BIT_reloadDStream(&seqState->DStream);
                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
                } else {
                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
                }
                seqState->prevOffset[2] = seqState->prevOffset[1];
                seqState->prevOffset[1] = seqState->prevOffset[0];
                seqState->prevOffset[0] = offset;
            } else {
                U32 const ll0 = (llDInfo->baseValue == 0);
                if (LIKELY((ofBits == 0))) {
                    offset = seqState->prevOffset[ll0];
                    seqState->prevOffset[1] = seqState->prevOffset[!ll0];
                    seqState->prevOffset[0] = offset;
                } else {
                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                        seqState->prevOffset[1] = seqState->prevOffset[0];
                        seqState->prevOffset[0] = offset = temp;
            }   }   }
            seq.offset = offset;
        }
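
        /* Note (illustrative, not part of the upstream file): the ofBits <= 1
         * branch above implements the format's repeat offsets. ofBits == 0
         * reuses prevOffset[0], or prevOffset[1] when the literal length is 0
         * (ll0 == 1); ofBits == 1 selects among the remaining history, with
         * the raw value 3 mapping to prevOffset[0] - 1, and the chosen offset
         * is rotated to the front of the prevOffset[] history. */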

#if defined(__clang__)
        if (UNLIKELY(mlBits > 0))
#else
        if (mlBits > 0)
#endif
            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);

        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
            BIT_reloadDStream(&seqState->DStream);
        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
            BIT_reloadDStream(&seqState->DStream);
        /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

#if defined(__clang__)
        if (UNLIKELY(llBits > 0))
#else
        if (llBits > 0)
#endif
            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);

        if (MEM_32bits())
            BIT_reloadDStream(&seqState->DStream);

        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
    }

    return seq;
}

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
{
    size_t const windowSize = dctx->fParams.windowSize;
    /* No dictionary used. */
    if (dctx->dictContentEndForFuzzing == NULL) return 0;
    /* Dictionary is our prefix. */
    if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
    /* Dictionary is not our ext-dict. */
    if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
    /* Dictionary is not within our window size. */
    if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
    /* Dictionary is active. */
    return 1;
}

MEM_STATIC void ZSTD_assertValidSequence(
        ZSTD_DCtx const* dctx,
        BYTE const* op, BYTE const* oend,
        seq_t const seq,
        BYTE const* prefixStart, BYTE const* virtualStart)
{
#if DEBUGLEVEL >= 1
    size_t const windowSize = dctx->fParams.windowSize;
    size_t const sequenceSize = seq.litLength + seq.matchLength;
    BYTE const* const oLitEnd = op + seq.litLength;
    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
    assert(op <= oend);
    assert((size_t)(oend - op) >= sequenceSize);
    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
        /* Offset must be within the dictionary. */
        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
        assert(seq.offset <= windowSize + dictSize);
    } else {
        /* Offset must be within our window. */
        assert(seq.offset <= windowSize);
    }
#else
    (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
#endif
}
#endif
1316
1317
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
1318
1319
1320
FORCE_INLINE_TEMPLATE size_t
1321
DONT_VECTORIZE
1322
ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
1323
void* dst, size_t maxDstSize,
1324
const void* seqStart, size_t seqSize, int nbSeq,
1325
const ZSTD_longOffset_e isLongOffset,
1326
const int frame)
1327
{
1328
const BYTE* ip = (const BYTE*)seqStart;
1329
const BYTE* const iend = ip + seqSize;
1330
BYTE* const ostart = (BYTE*)dst;
1331
BYTE* const oend = ostart + maxDstSize;
1332
BYTE* op = ostart;
1333
const BYTE* litPtr = dctx->litPtr;
1334
const BYTE* litBufferEnd = dctx->litBufferEnd;
1335
const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
1336
const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
1337
const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
1338
DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
1339
(void)frame;
1340
1341
/* Regen sequences */
1342
if (nbSeq) {
1343
seqState_t seqState;
1344
dctx->fseEntropy = 1;
1345
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
1346
RETURN_ERROR_IF(
1347
ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
1348
corruption_detected, "");
1349
ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
1350
ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
1351
ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
1352
assert(dst != NULL);
1353
1354
ZSTD_STATIC_ASSERT(
1355
BIT_DStream_unfinished < BIT_DStream_completed &&
1356
BIT_DStream_endOfBuffer < BIT_DStream_completed &&
1357
BIT_DStream_completed < BIT_DStream_overflow);
1358
1359
/* decompress without overrunning litPtr begins */
1360
{
1361
seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
1362
/* Align the decompression loop to 32 + 16 bytes.
1363
*
1364
* zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
1365
* speed swings based on the alignment of the decompression loop. This
1366
* performance swing is caused by parts of the decompression loop falling
1367
* out of the DSB. The entire decompression loop should fit in the DSB,
1368
* when it can't we get much worse performance. You can measure if you've
1369
* hit the good case or the bad case with this perf command for some
1370
* compressed file test.zst:
1371
*
1372
* perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
1373
* -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
1374
*
1375
* If you see most cycles served out of the MITE you've hit the bad case.
1376
* If you see most cycles served out of the DSB you've hit the good case.
1377
* If it is pretty even then you may be in an okay case.
1378
*
1379
* This issue has been reproduced on the following CPUs:
1380
* - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
1381
* Use Instruments->Counters to get DSB/MITE cycles.
1382
* I never got performance swings, but I was able to
1383
* go from the good case of mostly DSB to half of the
1384
* cycles served from MITE.
1385
* - Coffeelake: Intel i9-9900k
1386
* - Coffeelake: Intel i7-9700k
1387
*
1388
* I haven't been able to reproduce the instability or DSB misses on any
1389
* of the following CPUS:
1390
* - Haswell
1391
* - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH
1392
* - Skylake
1393
*
1394
* Alignment is done for each of the three major decompression loops:
1395
* - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
1396
* - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
1397
* - ZSTD_decompressSequences_body
1398
* Alignment choices are made to minimize large swings on bad cases and influence on performance
1399
* from changes external to this code, rather than to overoptimize on the current commit.
1400
*
1401
* If you are seeing performance stability this script can help test.
1402
* It tests on 4 commits in zstd where I saw performance change.
1403
*
1404
* https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
1405
*/
1406
#if defined(__GNUC__) && defined(__x86_64__)
1407
__asm__(".p2align 6");
1408
# if __GNUC__ >= 7
1409
/* good for gcc-7, gcc-9, and gcc-11 */
1410
__asm__("nop");
1411
__asm__(".p2align 5");
1412
__asm__("nop");
1413
__asm__(".p2align 4");
1414
# if __GNUC__ == 8 || __GNUC__ == 10
1415
/* good for gcc-8 and gcc-10 */
1416
__asm__("nop");
1417
__asm__(".p2align 3");
1418
# endif
1419
# endif
1420
#endif

            /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                    return oneSeqSize;
                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                op += oneSeqSize;
                if (UNLIKELY(!--nbSeq))
                    break;
                BIT_reloadDStream(&(seqState.DStream));
                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            }

            /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
            if (nbSeq > 0) {
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequence.litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                    assert(!ZSTD_isError(oneSeqSize));
                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                        return oneSeqSize;
                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                    op += oneSeqSize;
                    if (--nbSeq)
                        BIT_reloadDStream(&(seqState.DStream));
                }
            }
        }

        if (nbSeq > 0) /* there is remaining lit from extra buffer */
        {

#if defined(__GNUC__) && defined(__x86_64__)
            __asm__(".p2align 6");
            __asm__("nop");
#  if __GNUC__ != 7
            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
            __asm__(".p2align 4");
            __asm__("nop");
            __asm__(".p2align 3");
#  elif __GNUC__ >= 11
            __asm__(".p2align 3");
#  else
            __asm__(".p2align 5");
            __asm__("nop");
            __asm__(".p2align 3");
#  endif
#endif

            for (; ; ) {
                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                    return oneSeqSize;
                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                op += oneSeqSize;
                if (UNLIKELY(!--nbSeq))
                    break;
                BIT_reloadDStream(&(seqState.DStream));
            }
        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
    {
        size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
        litPtr = dctx->litExtraBuffer;
        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
        dctx->litBufferLocation = ZSTD_not_in_dst;
    }
    {   size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}
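
/* Editor's note (illustration) : in the split layout, ZSTD_allocateLiteralsBuffer()
 * placed the first part of the literals inside dst itself (dctx->litBuffer) and the
 * remainder in dctx->litExtraBuffer. The first loop above consumes sequences while
 * litPtr still points into dst; once the split point is crossed, the leftover
 * literals are drained with ZSTD_safecopyDstBeforeSrc() and decoding continues out
 * of litExtraBuffer, which is why litBufferLocation flips to ZSTD_not_in_dst
 * mid-function. */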

FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
    void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq,
    const ZSTD_longOffset_e isLongOffset,
    const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* const litEnd = litPtr + dctx->litSize;
    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
    DEBUGLOG(5, "ZSTD_decompressSequences_body");
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
        seqState_t seqState;
        dctx->fseEntropy = 1;
        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
        assert(dst != NULL);

        ZSTD_STATIC_ASSERT(
                BIT_DStream_unfinished < BIT_DStream_completed &&
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

#if defined(__GNUC__) && defined(__x86_64__)
        __asm__(".p2align 6");
        __asm__("nop");
#  if __GNUC__ >= 7
        __asm__(".p2align 5");
        __asm__("nop");
        __asm__(".p2align 3");
#  else
        __asm__(".p2align 4");
        __asm__("nop");
        __asm__(".p2align 3");
#  endif
#endif

        for ( ; ; ) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
            assert(!ZSTD_isError(oneSeqSize));
            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
#endif
            if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                return oneSeqSize;
            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
            op += oneSeqSize;
            if (UNLIKELY(!--nbSeq))
                break;
            BIT_reloadDStream(&(seqState.DStream));
        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    {   size_t const lastLLSize = litEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memcpy(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
                                 void* dst, size_t maxDstSize,
                                 const void* seqStart, size_t seqSize, int nbSeq,
                                 const ZSTD_longOffset_e isLongOffset,
                                 const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}

static size_t
ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
                                               void* dst, size_t maxDstSize,
                                               const void* seqStart, size_t seqSize, int nbSeq,
                                               const ZSTD_longOffset_e isLongOffset,
                                               const int frame)
{
    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT

FORCE_INLINE_TEMPLATE size_t
ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
                   const BYTE* const prefixStart, const BYTE* const dictEnd)
{
    prefetchPos += sequence.litLength;
    {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
        const BYTE* const match = matchBase + prefetchPos - sequence.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
                                                                               * No consequence though : memory address is only used for prefetching, not for dereferencing */
        PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
    }
    return prefetchPos + sequence.matchLength;
}
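
/* Illustration (editor's note, hypothetical numbers) : suppose prefetchPos is
 * 100 on entry and the freshly decoded sequence carries litLength=5,
 * offset=30, matchLength=8. The literals advance the position to 105; since
 * offset (30) <= 105 the match lies in the prefix, so PREFETCH_L1 is issued
 * for prefixStart + 105 - 30 = prefixStart + 75, and the function returns
 * 113, the position right after the match. Had offset exceeded 105, the
 * match would resolve into the dictionary, at dictEnd - (offset - 105). */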

/* This decoding function employs prefetching
 * to reduce latency impact of cache misses.
 * It's generally employed when a block contains a significant portion of long-distance matches
 * or when coupled with a "cold" dictionary */
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body(
                               ZSTD_DCtx* dctx,
                               void* dst, size_t maxDstSize,
                               const void* seqStart, size_t seqSize, int nbSeq,
                               const ZSTD_longOffset_e isLongOffset,
                               const int frame)
{
    const BYTE* ip = (const BYTE*)seqStart;
    const BYTE* const iend = ip + seqSize;
    BYTE* const ostart = (BYTE*)dst;
    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
    BYTE* op = ostart;
    const BYTE* litPtr = dctx->litPtr;
    const BYTE* litBufferEnd = dctx->litBufferEnd;
    const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
    const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
    (void)frame;

    /* Regen sequences */
    if (nbSeq) {
#define STORED_SEQS 8
#define STORED_SEQS_MASK (STORED_SEQS-1)
#define ADVANCED_SEQS STORED_SEQS
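        /* Editor's note (illustration) : the ring of STORED_SEQS decoded
         * sequences lets each match be prefetched several iterations before
         * it is copied. With nbSeq = 20 for instance : sequences 0..7 are
         * decoded and prefetched up front; the main loop then executes
         * sequence (seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK while decoding
         * and prefetching sequence seqNb, i.e. it executes 0..11 while
         * decoding 8..19; the "finish queue" loop drains the remaining
         * sequences 12..19. */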
        seq_t sequences[STORED_SEQS];
        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
        seqState_t seqState;
        int seqNb;
        size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */

        dctx->fseEntropy = 1;
        { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        assert(dst != NULL);
        assert(iend >= ip);
        RETURN_ERROR_IF(
            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
            corruption_detected, "");
        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);

        /* prepare in advance */
        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
            sequences[seqNb] = sequence;
        }
        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");

        /* decompress without stomping litBuffer */
        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
            size_t oneSeqSize;

            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
            {
                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
                sequences[seqNb & STORED_SEQS_MASK] = sequence;
                op += oneSeqSize;
            }
            else
            {
                /* lit buffer is either wholly contained in first or second split, or not split at all */
                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;

                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
                sequences[seqNb & STORED_SEQS_MASK] = sequence;
                op += oneSeqSize;
            }
        }
        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");

        /* finish queue */
        seqNb -= seqAdvance;
        for ( ; seqNb<nbSeq ; seqNb++) {
            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
            {
                const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                if (leftoverLit)
                {
                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                    sequence->litLength -= leftoverLit;
                    op += leftoverLit;
                }
                litPtr = dctx->litExtraBuffer;
                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                dctx->litBufferLocation = ZSTD_not_in_dst;
                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                    assert(!ZSTD_isError(oneSeqSize));
                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                    op += oneSeqSize;
                }
            }
            else
            {
                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                    ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                assert(!ZSTD_isError(oneSeqSize));
                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
#endif
                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                op += oneSeqSize;
            }
        }

        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
    }

    /* last literal segment */
    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */
    {
        size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
        litPtr = dctx->litExtraBuffer;
        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
    }
    {   size_t const lastLLSize = litBufferEnd - litPtr;
        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
        if (op != NULL) {
            ZSTD_memmove(op, litPtr, lastLLSize);
            op += lastLLSize;
        }
    }

    return op-ostart;
}

static size_t
ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
                                     void* dst, size_t maxDstSize,
                                     const void* seqStart, size_t seqSize, int nbSeq,
                                     const ZSTD_longOffset_e isLongOffset,
                                     const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


#if DYNAMIC_BMI2

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static BMI2_TARGET_ATTRIBUTE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                              void* dst, size_t maxDstSize,
                              const void* seqStart, size_t seqSize, int nbSeq,
                              const ZSTD_longOffset_e isLongOffset,
                              const int frame)
{
    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
static BMI2_TARGET_ATTRIBUTE size_t
DONT_VECTORIZE
ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
                                            void* dst, size_t maxDstSize,
                                            const void* seqStart, size_t seqSize, int nbSeq,
                                            const ZSTD_longOffset_e isLongOffset,
                                            const int frame)
{
    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
static BMI2_TARGET_ATTRIBUTE size_t
ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                                  const void* seqStart, size_t seqSize, int nbSeq,
                                  const ZSTD_longOffset_e isLongOffset,
                                  const int frame)
{
    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#endif /* DYNAMIC_BMI2 */

typedef size_t (*ZSTD_decompressSequences_t)(
                            ZSTD_DCtx* dctx,
                            void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
                            const ZSTD_longOffset_e isLongOffset,
                            const int frame);

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static size_t
ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                         const void* seqStart, size_t seqSize, int nbSeq,
                         const ZSTD_longOffset_e isLongOffset,
                         const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequences");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
static size_t
ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                                       const void* seqStart, size_t seqSize, int nbSeq,
                                       const ZSTD_longOffset_e isLongOffset,
                                       const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */


#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/* ZSTD_decompressSequencesLong() :
 * decompression function triggered when a minimum share of offsets is considered "long",
 * aka out of cache.
 * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
 * This function will try to mitigate main memory latency through the use of prefetching */
static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
                             void* dst, size_t maxDstSize,
                             const void* seqStart, size_t seqSize, int nbSeq,
                             const ZSTD_longOffset_e isLongOffset,
                             const int frame)
{
    DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2
    if (ZSTD_DCtx_get_bmi2(dctx)) {
        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
    }
#endif
    return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */


#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
/* ZSTD_getLongOffsetsShare() :
 * condition : offTable must be valid
 * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
 *           compared to maximum possible of (1<<OffFSELog) */
static unsigned
ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
{
    const void* ptr = offTable;
    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
    const ZSTD_seqSymbol* table = offTable + 1;
    U32 const max = 1 << tableLog;
    U32 u, total = 0;
    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);

    assert(max <= (1 << OffFSELog));  /* max not too large */
    for (u=0; u<max; u++) {
        if (table[u].nbAdditionalBits > 22) total += 1;
    }

    assert(tableLog <= OffFSELog);
    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */

    return total;
}
#endif
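
/* Worked example (editor's note, hypothetical values) : with tableLog = 6 the
 * table holds max = 64 states; if 3 of them carry nbAdditionalBits > 22, then
 * total = 3 << (OffFSELog - 6) = 12 after scaling to OffFSELog = 8, i.e. a
 * 12/256 ~= 4.7% long-offset share. Measured against the minShare heuristics
 * used below (7 on 64-bit ~= 2.73%, 20 on 32-bit ~= 7.81%), such a block would
 * select the prefetching decoder on 64-bit targets but not on 32-bit ones. */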
size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                      void* dst, size_t dstCapacity,
                const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
{   /* blockType == blockCompressed */
    const BYTE* ip = (const BYTE*)src;
    /* isLongOffset must be true if there are long offsets.
     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
     * We don't expect that to be the case in 64-bit mode.
     * In block mode, window size is not known, so we have to be conservative.
     * (note: but it could be evaluated from current-lowLimit)
     */
    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");

    /* Decode literals section */
    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
        if (ZSTD_isError(litCSize)) return litCSize;
        ip += litCSize;
        srcSize -= litCSize;
    }

    /* Build Decoding Tables */
    {
        /* These macros control at build-time which decompressor implementation
         * we use. If neither is defined, we do some inspection and dispatch at
         * runtime.
         */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        int usePrefetchDecoder = dctx->ddictIsCold;
#endif
        int nbSeq;
        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
        if (ZSTD_isError(seqHSize)) return seqHSize;
        ip += seqHSize;
        srcSize -= seqHSize;

        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if ( !usePrefetchDecoder
          && (!frame || (dctx->fParams.windowSize > (1<<24)))
          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
            U32 const minShare = MEM_64bits() ? 7 : 20;  /* heuristic values, correspond to 2.73% and 7.81% */
            usePrefetchDecoder = (shareLongOffsets >= minShare);
        }
#endif

        dctx->ddictIsCold = 0;

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
        if (usePrefetchDecoder)
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
        /* else */
        if (dctx->litBufferLocation == ZSTD_split)
            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
        else
            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif
    }
}
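
/* Dispatch summary (editor's note) : when neither FORCE macro is defined, a
 * block goes to ZSTD_decompressSequencesLong() if the dictionary is cold, or
 * if the window may be large, the block carries more than ADVANCED_SEQS
 * sequences, and enough offsets exceed 1<<23 for prefetching to pay off;
 * otherwise it goes to ZSTD_decompressSequences() or its split-literal-buffer
 * variant, depending on where ZSTD_decodeLiteralsBlock() placed the literals. */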

void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
{
    if (dst != dctx->previousDstEnd && dstSize > 0) {  /* not contiguous */
        dctx->dictEnd = dctx->previousDstEnd;
        dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
        dctx->prefixStart = dst;
        dctx->previousDstEnd = dst;
    }
}
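
/* Illustration (editor's note, hypothetical addresses) : if the previous block
 * wrote [0x1000, 0x1400) and the next block targets a non-contiguous dst =
 * 0x8000, the old segment becomes the extDict ending at dictEnd = 0x1400, and
 * virtualStart = 0x8000 - 0x400 = 0x7C00. Offsets reaching back past
 * prefixStart = 0x8000 are then remapped into the old segment as if the two
 * ranges were contiguous. */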

size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
                            void* dst, size_t dstCapacity,
                      const void* src, size_t srcSize)
{
    size_t dSize;
    ZSTD_checkContinuity(dctx, dst, dstCapacity);
    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
    dctx->previousDstEnd = (char*)dst + dSize;
    return dSize;
}
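
/* Usage sketch (editor's illustration, not part of the library) : the
 * block-level API requires an initialized context; per zstd.h,
 * ZSTD_decompressBegin() (or a _usingDict() variant) must be called before
 * decoding the first block.
 *
 *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
 *     size_t const initErr = ZSTD_decompressBegin(dctx);
 *     size_t const dSize = ZSTD_isError(initErr) ? initErr
 *         : ZSTD_decompressBlock(dctx, dst, dstCapacity, src, srcSize);
 *     if (ZSTD_isError(dSize)) handle_error(dSize);
 *     ZSTD_freeDCtx(dctx);
 */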