Path: blob/master/Utilities/cmzstd/lib/decompress/zstd_decompress_block.c
3158 views
/*1* Copyright (c) Meta Platforms, Inc. and affiliates.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910/* zstd_decompress_block :11* this module takes care of decompressing _compressed_ block */1213/*-*******************************************************14* Dependencies15*********************************************************/16#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */17#include "../common/compiler.h" /* prefetch */18#include "../common/cpu.h" /* bmi2 */19#include "../common/mem.h" /* low level memory routines */20#define FSE_STATIC_LINKING_ONLY21#include "../common/fse.h"22#include "../common/huf.h"23#include "../common/zstd_internal.h"24#include "zstd_decompress_internal.h" /* ZSTD_DCtx */25#include "zstd_ddict.h" /* ZSTD_DDictDictContent */26#include "zstd_decompress_block.h"27#include "../common/bits.h" /* ZSTD_highbit32 */2829/*_*******************************************************30* Macros31**********************************************************/3233/* These two optional macros force the use one way or another of the two34* ZSTD_decompressSequences implementations. You can't force in both directions35* at the same time.36*/37#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \38defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)39#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"40#endif414243/*_*******************************************************44* Memory operations45**********************************************************/46static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }474849/*-*************************************************************50* Block decoding51***************************************************************/5253/*! ZSTD_getcBlockSize() :54* Provides the size of compressed block from block header `src` */55size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,56blockProperties_t* bpPtr)57{58RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");5960{ U32 const cBlockHeader = MEM_readLE24(src);61U32 const cSize = cBlockHeader >> 3;62bpPtr->lastBlock = cBlockHeader & 1;63bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);64bpPtr->origSize = cSize; /* only useful for RLE */65if (bpPtr->blockType == bt_rle) return 1;66RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");67return cSize;68}69}7071/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */72static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,73const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)74{75if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)76{77/* room for litbuffer to fit without read faulting */78dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;79dctx->litBufferEnd = dctx->litBuffer + litSize;80dctx->litBufferLocation = ZSTD_in_dst;81}82else if (litSize > ZSTD_LITBUFFEREXTRASIZE)83{84/* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */85if (splitImmediately) {86/* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */87dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;88dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;89}90else {91/* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */92dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;93dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;94}95dctx->litBufferLocation = ZSTD_split;96}97else98{99/* fits entirely within litExtraBuffer, so no split is necessary */100dctx->litBuffer = dctx->litExtraBuffer;101dctx->litBufferEnd = dctx->litBuffer + litSize;102dctx->litBufferLocation = ZSTD_not_in_dst;103}104}105106/* Hidden declaration for fullbench */107size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,108const void* src, size_t srcSize,109void* dst, size_t dstCapacity, const streaming_operation streaming);110/*! ZSTD_decodeLiteralsBlock() :111* Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored112* in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current113* block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being114* stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write.115*116* @return : nb of bytes read from src (< srcSize )117* note : symbol not declared but exposed for fullbench */118size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,119const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */120void* dst, size_t dstCapacity, const streaming_operation streaming)121{122DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");123RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");124125{ const BYTE* const istart = (const BYTE*) src;126symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);127128switch(litEncType)129{130case set_repeat:131DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");132RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");133ZSTD_FALLTHROUGH;134135case set_compressed:136RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");137{ size_t lhSize, litSize, litCSize;138U32 singleStream=0;139U32 const lhlCode = (istart[0] >> 2) & 3;140U32 const lhc = MEM_readLE32(istart);141size_t hufSuccess;142size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);143int const flags = 0144| (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)145| (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);146switch(lhlCode)147{148case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */149/* 2 - 2 - 10 - 10 */150singleStream = !lhlCode;151lhSize = 3;152litSize = (lhc >> 4) & 0x3FF;153litCSize = (lhc >> 14) & 0x3FF;154break;155case 2:156/* 2 - 2 - 14 - 14 */157lhSize = 4;158litSize = (lhc >> 4) & 0x3FFF;159litCSize = lhc >> 18;160break;161case 3:162/* 2 - 2 - 18 - 18 */163lhSize = 5;164litSize = (lhc >> 4) & 0x3FFFF;165litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);166break;167}168RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");169RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");170if (!singleStream)171RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,172"Not enough literals (%zu) for the 4-streams mode (min %u)",173litSize, MIN_LITERALS_FOR_4_STREAMS);174RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");175RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");176ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);177178/* prefetch huffman table if cold */179if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {180PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));181}182183if (litEncType==set_repeat) {184if (singleStream) {185hufSuccess = HUF_decompress1X_usingDTable(186dctx->litBuffer, litSize, istart+lhSize, litCSize,187dctx->HUFptr, flags);188} else {189assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);190hufSuccess = HUF_decompress4X_usingDTable(191dctx->litBuffer, litSize, istart+lhSize, litCSize,192dctx->HUFptr, flags);193}194} else {195if (singleStream) {196#if defined(HUF_FORCE_DECOMPRESS_X2)197hufSuccess = HUF_decompress1X_DCtx_wksp(198dctx->entropy.hufTable, dctx->litBuffer, litSize,199istart+lhSize, litCSize, dctx->workspace,200sizeof(dctx->workspace), flags);201#else202hufSuccess = HUF_decompress1X1_DCtx_wksp(203dctx->entropy.hufTable, dctx->litBuffer, litSize,204istart+lhSize, litCSize, dctx->workspace,205sizeof(dctx->workspace), flags);206#endif207} else {208hufSuccess = HUF_decompress4X_hufOnly_wksp(209dctx->entropy.hufTable, dctx->litBuffer, litSize,210istart+lhSize, litCSize, dctx->workspace,211sizeof(dctx->workspace), flags);212}213}214if (dctx->litBufferLocation == ZSTD_split)215{216ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);217ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);218dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;219dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;220}221222RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");223224dctx->litPtr = dctx->litBuffer;225dctx->litSize = litSize;226dctx->litEntropy = 1;227if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;228return litCSize + lhSize;229}230231case set_basic:232{ size_t litSize, lhSize;233U32 const lhlCode = ((istart[0]) >> 2) & 3;234size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);235switch(lhlCode)236{237case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */238lhSize = 1;239litSize = istart[0] >> 3;240break;241case 1:242lhSize = 2;243litSize = MEM_readLE16(istart) >> 4;244break;245case 3:246lhSize = 3;247RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");248litSize = MEM_readLE24(istart) >> 4;249break;250}251252RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");253RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");254ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);255if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */256RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");257if (dctx->litBufferLocation == ZSTD_split)258{259ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE);260ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);261}262else263{264ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize);265}266dctx->litPtr = dctx->litBuffer;267dctx->litSize = litSize;268return lhSize+litSize;269}270/* direct reference into compressed stream */271dctx->litPtr = istart+lhSize;272dctx->litSize = litSize;273dctx->litBufferEnd = dctx->litPtr + litSize;274dctx->litBufferLocation = ZSTD_not_in_dst;275return lhSize+litSize;276}277278case set_rle:279{ U32 const lhlCode = ((istart[0]) >> 2) & 3;280size_t litSize, lhSize;281size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);282switch(lhlCode)283{284case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */285lhSize = 1;286litSize = istart[0] >> 3;287break;288case 1:289lhSize = 2;290RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");291litSize = MEM_readLE16(istart) >> 4;292break;293case 3:294lhSize = 3;295RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");296litSize = MEM_readLE24(istart) >> 4;297break;298}299RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");300RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");301RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");302ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);303if (dctx->litBufferLocation == ZSTD_split)304{305ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE);306ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE);307}308else309{310ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize);311}312dctx->litPtr = dctx->litBuffer;313dctx->litSize = litSize;314return lhSize+1;315}316default:317RETURN_ERROR(corruption_detected, "impossible");318}319}320}321322/* Default FSE distribution tables.323* These are pre-calculated FSE decoding tables using default distributions as defined in specification :324* https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions325* They were generated programmatically with following method :326* - start from default distributions, present in /lib/common/zstd_internal.h327* - generate tables normally, using ZSTD_buildFSETable()328* - printout the content of tables329* - pretify output, report below, test with fuzzer to ensure it's correct */330331/* Default FSE distribution table for Literal Lengths */332static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {333{ 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */334/* nextState, nbAddBits, nbBits, baseVal */335{ 0, 0, 4, 0}, { 16, 0, 4, 0},336{ 32, 0, 5, 1}, { 0, 0, 5, 3},337{ 0, 0, 5, 4}, { 0, 0, 5, 6},338{ 0, 0, 5, 7}, { 0, 0, 5, 9},339{ 0, 0, 5, 10}, { 0, 0, 5, 12},340{ 0, 0, 6, 14}, { 0, 1, 5, 16},341{ 0, 1, 5, 20}, { 0, 1, 5, 22},342{ 0, 2, 5, 28}, { 0, 3, 5, 32},343{ 0, 4, 5, 48}, { 32, 6, 5, 64},344{ 0, 7, 5, 128}, { 0, 8, 6, 256},345{ 0, 10, 6, 1024}, { 0, 12, 6, 4096},346{ 32, 0, 4, 0}, { 0, 0, 4, 1},347{ 0, 0, 5, 2}, { 32, 0, 5, 4},348{ 0, 0, 5, 5}, { 32, 0, 5, 7},349{ 0, 0, 5, 8}, { 32, 0, 5, 10},350{ 0, 0, 5, 11}, { 0, 0, 6, 13},351{ 32, 1, 5, 16}, { 0, 1, 5, 18},352{ 32, 1, 5, 22}, { 0, 2, 5, 24},353{ 32, 3, 5, 32}, { 0, 3, 5, 40},354{ 0, 6, 4, 64}, { 16, 6, 4, 64},355{ 32, 7, 5, 128}, { 0, 9, 6, 512},356{ 0, 11, 6, 2048}, { 48, 0, 4, 0},357{ 16, 0, 4, 1}, { 32, 0, 5, 2},358{ 32, 0, 5, 3}, { 32, 0, 5, 5},359{ 32, 0, 5, 6}, { 32, 0, 5, 8},360{ 32, 0, 5, 9}, { 32, 0, 5, 11},361{ 32, 0, 5, 12}, { 0, 0, 6, 15},362{ 32, 1, 5, 18}, { 32, 1, 5, 20},363{ 32, 2, 5, 24}, { 32, 2, 5, 28},364{ 32, 3, 5, 40}, { 32, 4, 5, 48},365{ 0, 16, 6,65536}, { 0, 15, 6,32768},366{ 0, 14, 6,16384}, { 0, 13, 6, 8192},367}; /* LL_defaultDTable */368369/* Default FSE distribution table for Offset Codes */370static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {371{ 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */372/* nextState, nbAddBits, nbBits, baseVal */373{ 0, 0, 5, 0}, { 0, 6, 4, 61},374{ 0, 9, 5, 509}, { 0, 15, 5,32765},375{ 0, 21, 5,2097149}, { 0, 3, 5, 5},376{ 0, 7, 4, 125}, { 0, 12, 5, 4093},377{ 0, 18, 5,262141}, { 0, 23, 5,8388605},378{ 0, 5, 5, 29}, { 0, 8, 4, 253},379{ 0, 14, 5,16381}, { 0, 20, 5,1048573},380{ 0, 2, 5, 1}, { 16, 7, 4, 125},381{ 0, 11, 5, 2045}, { 0, 17, 5,131069},382{ 0, 22, 5,4194301}, { 0, 4, 5, 13},383{ 16, 8, 4, 253}, { 0, 13, 5, 8189},384{ 0, 19, 5,524285}, { 0, 1, 5, 1},385{ 16, 6, 4, 61}, { 0, 10, 5, 1021},386{ 0, 16, 5,65533}, { 0, 28, 5,268435453},387{ 0, 27, 5,134217725}, { 0, 26, 5,67108861},388{ 0, 25, 5,33554429}, { 0, 24, 5,16777213},389}; /* OF_defaultDTable */390391392/* Default FSE distribution table for Match Lengths */393static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {394{ 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */395/* nextState, nbAddBits, nbBits, baseVal */396{ 0, 0, 6, 3}, { 0, 0, 4, 4},397{ 32, 0, 5, 5}, { 0, 0, 5, 6},398{ 0, 0, 5, 8}, { 0, 0, 5, 9},399{ 0, 0, 5, 11}, { 0, 0, 6, 13},400{ 0, 0, 6, 16}, { 0, 0, 6, 19},401{ 0, 0, 6, 22}, { 0, 0, 6, 25},402{ 0, 0, 6, 28}, { 0, 0, 6, 31},403{ 0, 0, 6, 34}, { 0, 1, 6, 37},404{ 0, 1, 6, 41}, { 0, 2, 6, 47},405{ 0, 3, 6, 59}, { 0, 4, 6, 83},406{ 0, 7, 6, 131}, { 0, 9, 6, 515},407{ 16, 0, 4, 4}, { 0, 0, 4, 5},408{ 32, 0, 5, 6}, { 0, 0, 5, 7},409{ 32, 0, 5, 9}, { 0, 0, 5, 10},410{ 0, 0, 6, 12}, { 0, 0, 6, 15},411{ 0, 0, 6, 18}, { 0, 0, 6, 21},412{ 0, 0, 6, 24}, { 0, 0, 6, 27},413{ 0, 0, 6, 30}, { 0, 0, 6, 33},414{ 0, 1, 6, 35}, { 0, 1, 6, 39},415{ 0, 2, 6, 43}, { 0, 3, 6, 51},416{ 0, 4, 6, 67}, { 0, 5, 6, 99},417{ 0, 8, 6, 259}, { 32, 0, 4, 4},418{ 48, 0, 4, 4}, { 16, 0, 4, 5},419{ 32, 0, 5, 7}, { 32, 0, 5, 8},420{ 32, 0, 5, 10}, { 32, 0, 5, 11},421{ 0, 0, 6, 14}, { 0, 0, 6, 17},422{ 0, 0, 6, 20}, { 0, 0, 6, 23},423{ 0, 0, 6, 26}, { 0, 0, 6, 29},424{ 0, 0, 6, 32}, { 0, 16, 6,65539},425{ 0, 15, 6,32771}, { 0, 14, 6,16387},426{ 0, 13, 6, 8195}, { 0, 12, 6, 4099},427{ 0, 11, 6, 2051}, { 0, 10, 6, 1027},428}; /* ML_defaultDTable */429430431static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)432{433void* ptr = dt;434ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;435ZSTD_seqSymbol* const cell = dt + 1;436437DTableH->tableLog = 0;438DTableH->fastMode = 0;439440cell->nbBits = 0;441cell->nextState = 0;442assert(nbAddBits < 255);443cell->nbAdditionalBits = nbAddBits;444cell->baseValue = baseValue;445}446447448/* ZSTD_buildFSETable() :449* generate FSE decoding table for one symbol (ll, ml or off)450* cannot fail if input is valid =>451* all inputs are presumed validated at this stage */452FORCE_INLINE_TEMPLATE453void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,454const short* normalizedCounter, unsigned maxSymbolValue,455const U32* baseValue, const U8* nbAdditionalBits,456unsigned tableLog, void* wksp, size_t wkspSize)457{458ZSTD_seqSymbol* const tableDecode = dt+1;459U32 const maxSV1 = maxSymbolValue + 1;460U32 const tableSize = 1 << tableLog;461462U16* symbolNext = (U16*)wksp;463BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);464U32 highThreshold = tableSize - 1;465466467/* Sanity Checks */468assert(maxSymbolValue <= MaxSeq);469assert(tableLog <= MaxFSELog);470assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);471(void)wkspSize;472/* Init, lay down lowprob symbols */473{ ZSTD_seqSymbol_header DTableH;474DTableH.tableLog = tableLog;475DTableH.fastMode = 1;476{ S16 const largeLimit= (S16)(1 << (tableLog-1));477U32 s;478for (s=0; s<maxSV1; s++) {479if (normalizedCounter[s]==-1) {480tableDecode[highThreshold--].baseValue = s;481symbolNext[s] = 1;482} else {483if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;484assert(normalizedCounter[s]>=0);485symbolNext[s] = (U16)normalizedCounter[s];486} } }487ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));488}489490/* Spread symbols */491assert(tableSize <= 512);492/* Specialized symbol spreading for the case when there are493* no low probability (-1 count) symbols. When compressing494* small blocks we avoid low probability symbols to hit this495* case, since header decoding speed matters more.496*/497if (highThreshold == tableSize - 1) {498size_t const tableMask = tableSize-1;499size_t const step = FSE_TABLESTEP(tableSize);500/* First lay down the symbols in order.501* We use a uint64_t to lay down 8 bytes at a time. This reduces branch502* misses since small blocks generally have small table logs, so nearly503* all symbols have counts <= 8. We ensure we have 8 bytes at the end of504* our buffer to handle the over-write.505*/506{507U64 const add = 0x0101010101010101ull;508size_t pos = 0;509U64 sv = 0;510U32 s;511for (s=0; s<maxSV1; ++s, sv += add) {512int i;513int const n = normalizedCounter[s];514MEM_write64(spread + pos, sv);515for (i = 8; i < n; i += 8) {516MEM_write64(spread + pos + i, sv);517}518assert(n>=0);519pos += (size_t)n;520}521}522/* Now we spread those positions across the table.523* The benefit of doing it in two stages is that we avoid the524* variable size inner loop, which caused lots of branch misses.525* Now we can run through all the positions without any branch misses.526* We unroll the loop twice, since that is what empirically worked best.527*/528{529size_t position = 0;530size_t s;531size_t const unroll = 2;532assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */533for (s = 0; s < (size_t)tableSize; s += unroll) {534size_t u;535for (u = 0; u < unroll; ++u) {536size_t const uPosition = (position + (u * step)) & tableMask;537tableDecode[uPosition].baseValue = spread[s + u];538}539position = (position + (unroll * step)) & tableMask;540}541assert(position == 0);542}543} else {544U32 const tableMask = tableSize-1;545U32 const step = FSE_TABLESTEP(tableSize);546U32 s, position = 0;547for (s=0; s<maxSV1; s++) {548int i;549int const n = normalizedCounter[s];550for (i=0; i<n; i++) {551tableDecode[position].baseValue = s;552position = (position + step) & tableMask;553while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */554} }555assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */556}557558/* Build Decoding table */559{560U32 u;561for (u=0; u<tableSize; u++) {562U32 const symbol = tableDecode[u].baseValue;563U32 const nextState = symbolNext[symbol]++;564tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );565tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);566assert(nbAdditionalBits[symbol] < 255);567tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];568tableDecode[u].baseValue = baseValue[symbol];569}570}571}572573/* Avoids the FORCE_INLINE of the _body() function. */574static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,575const short* normalizedCounter, unsigned maxSymbolValue,576const U32* baseValue, const U8* nbAdditionalBits,577unsigned tableLog, void* wksp, size_t wkspSize)578{579ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,580baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);581}582583#if DYNAMIC_BMI2584BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,585const short* normalizedCounter, unsigned maxSymbolValue,586const U32* baseValue, const U8* nbAdditionalBits,587unsigned tableLog, void* wksp, size_t wkspSize)588{589ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,590baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);591}592#endif593594void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,595const short* normalizedCounter, unsigned maxSymbolValue,596const U32* baseValue, const U8* nbAdditionalBits,597unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)598{599#if DYNAMIC_BMI2600if (bmi2) {601ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,602baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);603return;604}605#endif606(void)bmi2;607ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,608baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);609}610611612/*! ZSTD_buildSeqTable() :613* @return : nb bytes read from src,614* or an error code if it fails */615static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,616symbolEncodingType_e type, unsigned max, U32 maxLog,617const void* src, size_t srcSize,618const U32* baseValue, const U8* nbAdditionalBits,619const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,620int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,621int bmi2)622{623switch(type)624{625case set_rle :626RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");627RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");628{ U32 const symbol = *(const BYTE*)src;629U32 const baseline = baseValue[symbol];630U8 const nbBits = nbAdditionalBits[symbol];631ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);632}633*DTablePtr = DTableSpace;634return 1;635case set_basic :636*DTablePtr = defaultTable;637return 0;638case set_repeat:639RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");640/* prefetch FSE table if used */641if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {642const void* const pStart = *DTablePtr;643size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));644PREFETCH_AREA(pStart, pSize);645}646return 0;647case set_compressed :648{ unsigned tableLog;649S16 norm[MaxSeq+1];650size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);651RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");652RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");653ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);654*DTablePtr = DTableSpace;655return headerSize;656}657default :658assert(0);659RETURN_ERROR(GENERIC, "impossible");660}661}662663size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,664const void* src, size_t srcSize)665{666const BYTE* const istart = (const BYTE*)src;667const BYTE* const iend = istart + srcSize;668const BYTE* ip = istart;669int nbSeq;670DEBUGLOG(5, "ZSTD_decodeSeqHeaders");671672/* check */673RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");674675/* SeqHead */676nbSeq = *ip++;677if (!nbSeq) {678*nbSeqPtr=0;679RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");680return 1;681}682if (nbSeq > 0x7F) {683if (nbSeq == 0xFF) {684RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");685nbSeq = MEM_readLE16(ip) + LONGNBSEQ;686ip+=2;687} else {688RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");689nbSeq = ((nbSeq-0x80)<<8) + *ip++;690}691}692*nbSeqPtr = nbSeq;693694/* FSE table descriptors */695RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */696{ symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);697symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);698symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);699ip++;700701/* Build DTables */702{ size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,703LLtype, MaxLL, LLFSELog,704ip, iend-ip,705LL_base, LL_bits,706LL_defaultDTable, dctx->fseEntropy,707dctx->ddictIsCold, nbSeq,708dctx->workspace, sizeof(dctx->workspace),709ZSTD_DCtx_get_bmi2(dctx));710RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");711ip += llhSize;712}713714{ size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,715OFtype, MaxOff, OffFSELog,716ip, iend-ip,717OF_base, OF_bits,718OF_defaultDTable, dctx->fseEntropy,719dctx->ddictIsCold, nbSeq,720dctx->workspace, sizeof(dctx->workspace),721ZSTD_DCtx_get_bmi2(dctx));722RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");723ip += ofhSize;724}725726{ size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,727MLtype, MaxML, MLFSELog,728ip, iend-ip,729ML_base, ML_bits,730ML_defaultDTable, dctx->fseEntropy,731dctx->ddictIsCold, nbSeq,732dctx->workspace, sizeof(dctx->workspace),733ZSTD_DCtx_get_bmi2(dctx));734RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");735ip += mlhSize;736}737}738739return ip-istart;740}741742743typedef struct {744size_t litLength;745size_t matchLength;746size_t offset;747} seq_t;748749typedef struct {750size_t state;751const ZSTD_seqSymbol* table;752} ZSTD_fseState;753754typedef struct {755BIT_DStream_t DStream;756ZSTD_fseState stateLL;757ZSTD_fseState stateOffb;758ZSTD_fseState stateML;759size_t prevOffset[ZSTD_REP_NUM];760} seqState_t;761762/*! ZSTD_overlapCopy8() :763* Copies 8 bytes from ip to op and updates op and ip where ip <= op.764* If the offset is < 8 then the offset is spread to at least 8 bytes.765*766* Precondition: *ip <= *op767* Postcondition: *op - *op >= 8768*/769HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {770assert(*ip <= *op);771if (offset < 8) {772/* close range match, overlap */773static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */774static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */775int const sub2 = dec64table[offset];776(*op)[0] = (*ip)[0];777(*op)[1] = (*ip)[1];778(*op)[2] = (*ip)[2];779(*op)[3] = (*ip)[3];780*ip += dec32table[offset];781ZSTD_copy4(*op+4, *ip);782*ip -= sub2;783} else {784ZSTD_copy8(*op, *ip);785}786*ip += 8;787*op += 8;788assert(*op - *ip >= 8);789}790791/*! ZSTD_safecopy() :792* Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer793* and write up to 16 bytes past oend_w (op >= oend_w is allowed).794* This function is only called in the uncommon case where the sequence is near the end of the block. It795* should be fast for a single long sequence, but can be slow for several short sequences.796*797* @param ovtype controls the overlap detection798* - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.799* - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.800* The src buffer must be before the dst buffer.801*/802static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {803ptrdiff_t const diff = op - ip;804BYTE* const oend = op + length;805806assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||807(ovtype == ZSTD_overlap_src_before_dst && diff >= 0));808809if (length < 8) {810/* Handle short lengths. */811while (op < oend) *op++ = *ip++;812return;813}814if (ovtype == ZSTD_overlap_src_before_dst) {815/* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */816assert(length >= 8);817ZSTD_overlapCopy8(&op, &ip, diff);818length -= 8;819assert(op - ip >= 8);820assert(op <= oend);821}822823if (oend <= oend_w) {824/* No risk of overwrite. */825ZSTD_wildcopy(op, ip, length, ovtype);826return;827}828if (op <= oend_w) {829/* Wildcopy until we get close to the end. */830assert(oend > oend_w);831ZSTD_wildcopy(op, ip, oend_w - op, ovtype);832ip += oend_w - op;833op += oend_w - op;834}835/* Handle the leftovers. */836while (op < oend) *op++ = *ip++;837}838839/* ZSTD_safecopyDstBeforeSrc():840* This version allows overlap with dst before src, or handles the non-overlap case with dst after src841* Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */842static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {843ptrdiff_t const diff = op - ip;844BYTE* const oend = op + length;845846if (length < 8 || diff > -8) {847/* Handle short lengths, close overlaps, and dst not before src. */848while (op < oend) *op++ = *ip++;849return;850}851852if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) {853ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap);854ip += oend - WILDCOPY_OVERLENGTH - op;855op += oend - WILDCOPY_OVERLENGTH - op;856}857858/* Handle the leftovers. */859while (op < oend) *op++ = *ip++;860}861862/* ZSTD_execSequenceEnd():863* This version handles cases that are near the end of the output buffer. It requires864* more careful checks to make sure there is no overflow. By separating out these hard865* and unlikely cases, we can speed up the common cases.866*867* NOTE: This function needs to be fast for a single long sequence, but doesn't need868* to be optimized for many small sequences, since those fall into ZSTD_execSequence().869*/870FORCE_NOINLINE871size_t ZSTD_execSequenceEnd(BYTE* op,872BYTE* const oend, seq_t sequence,873const BYTE** litPtr, const BYTE* const litLimit,874const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)875{876BYTE* const oLitEnd = op + sequence.litLength;877size_t const sequenceLength = sequence.litLength + sequence.matchLength;878const BYTE* const iLitEnd = *litPtr + sequence.litLength;879const BYTE* match = oLitEnd - sequence.offset;880BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;881882/* bounds checks : careful of address space overflow in 32-bit mode */883RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");884RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");885assert(op < op + sequenceLength);886assert(oLitEnd < op + sequenceLength);887888/* copy literals */889ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);890op = oLitEnd;891*litPtr = iLitEnd;892893/* copy Match */894if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {895/* offset beyond prefix */896RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");897match = dictEnd - (prefixStart - match);898if (match + sequence.matchLength <= dictEnd) {899ZSTD_memmove(oLitEnd, match, sequence.matchLength);900return sequenceLength;901}902/* span extDict & currentPrefixSegment */903{ size_t const length1 = dictEnd - match;904ZSTD_memmove(oLitEnd, match, length1);905op = oLitEnd + length1;906sequence.matchLength -= length1;907match = prefixStart;908}909}910ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);911return sequenceLength;912}913914/* ZSTD_execSequenceEndSplitLitBuffer():915* This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.916*/917FORCE_NOINLINE918size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,919BYTE* const oend, const BYTE* const oend_w, seq_t sequence,920const BYTE** litPtr, const BYTE* const litLimit,921const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)922{923BYTE* const oLitEnd = op + sequence.litLength;924size_t const sequenceLength = sequence.litLength + sequence.matchLength;925const BYTE* const iLitEnd = *litPtr + sequence.litLength;926const BYTE* match = oLitEnd - sequence.offset;927928929/* bounds checks : careful of address space overflow in 32-bit mode */930RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");931RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");932assert(op < op + sequenceLength);933assert(oLitEnd < op + sequenceLength);934935/* copy literals */936RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");937ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);938op = oLitEnd;939*litPtr = iLitEnd;940941/* copy Match */942if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {943/* offset beyond prefix */944RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");945match = dictEnd - (prefixStart - match);946if (match + sequence.matchLength <= dictEnd) {947ZSTD_memmove(oLitEnd, match, sequence.matchLength);948return sequenceLength;949}950/* span extDict & currentPrefixSegment */951{ size_t const length1 = dictEnd - match;952ZSTD_memmove(oLitEnd, match, length1);953op = oLitEnd + length1;954sequence.matchLength -= length1;955match = prefixStart;956}957}958ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);959return sequenceLength;960}961962HINT_INLINE963size_t ZSTD_execSequence(BYTE* op,964BYTE* const oend, seq_t sequence,965const BYTE** litPtr, const BYTE* const litLimit,966const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)967{968BYTE* const oLitEnd = op + sequence.litLength;969size_t const sequenceLength = sequence.litLength + sequence.matchLength;970BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */971BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */972const BYTE* const iLitEnd = *litPtr + sequence.litLength;973const BYTE* match = oLitEnd - sequence.offset;974975assert(op != NULL /* Precondition */);976assert(oend_w < oend /* No underflow */);977978#if defined(__aarch64__)979/* prefetch sequence starting from match that will be used for copy later */980PREFETCH_L1(match);981#endif982/* Handle edge cases in a slow path:983* - Read beyond end of literals984* - Match end is within WILDCOPY_OVERLIMIT of oend985* - 32-bit mode and the match length overflows986*/987if (UNLIKELY(988iLitEnd > litLimit ||989oMatchEnd > oend_w ||990(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))991return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);992993/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */994assert(op <= oLitEnd /* No overflow */);995assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);996assert(oMatchEnd <= oend /* No underflow */);997assert(iLitEnd <= litLimit /* Literal length is in bounds */);998assert(oLitEnd <= oend_w /* Can wildcopy literals */);999assert(oMatchEnd <= oend_w /* Can wildcopy matches */);10001001/* Copy Literals:1002* Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.1003* We likely don't need the full 32-byte wildcopy.1004*/1005assert(WILDCOPY_OVERLENGTH >= 16);1006ZSTD_copy16(op, (*litPtr));1007if (UNLIKELY(sequence.litLength > 16)) {1008ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);1009}1010op = oLitEnd;1011*litPtr = iLitEnd; /* update for next sequence */10121013/* Copy Match */1014if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {1015/* offset beyond prefix -> go into extDict */1016RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");1017match = dictEnd + (match - prefixStart);1018if (match + sequence.matchLength <= dictEnd) {1019ZSTD_memmove(oLitEnd, match, sequence.matchLength);1020return sequenceLength;1021}1022/* span extDict & currentPrefixSegment */1023{ size_t const length1 = dictEnd - match;1024ZSTD_memmove(oLitEnd, match, length1);1025op = oLitEnd + length1;1026sequence.matchLength -= length1;1027match = prefixStart;1028}1029}1030/* Match within prefix of 1 or more bytes */1031assert(op <= oMatchEnd);1032assert(oMatchEnd <= oend_w);1033assert(match >= prefixStart);1034assert(sequence.matchLength >= 1);10351036/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy1037* without overlap checking.1038*/1039if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {1040/* We bet on a full wildcopy for matches, since we expect matches to be1041* longer than literals (in general). In silesia, ~10% of matches are longer1042* than 16 bytes.1043*/1044ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);1045return sequenceLength;1046}1047assert(sequence.offset < WILDCOPY_VECLEN);10481049/* Copy 8 bytes and spread the offset to be >= 8. */1050ZSTD_overlapCopy8(&op, &match, sequence.offset);10511052/* If the match length is > 8 bytes, then continue with the wildcopy. */1053if (sequence.matchLength > 8) {1054assert(op < oMatchEnd);1055ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);1056}1057return sequenceLength;1058}10591060HINT_INLINE1061size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,1062BYTE* const oend, const BYTE* const oend_w, seq_t sequence,1063const BYTE** litPtr, const BYTE* const litLimit,1064const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)1065{1066BYTE* const oLitEnd = op + sequence.litLength;1067size_t const sequenceLength = sequence.litLength + sequence.matchLength;1068BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */1069const BYTE* const iLitEnd = *litPtr + sequence.litLength;1070const BYTE* match = oLitEnd - sequence.offset;10711072assert(op != NULL /* Precondition */);1073assert(oend_w < oend /* No underflow */);1074/* Handle edge cases in a slow path:1075* - Read beyond end of literals1076* - Match end is within WILDCOPY_OVERLIMIT of oend1077* - 32-bit mode and the match length overflows1078*/1079if (UNLIKELY(1080iLitEnd > litLimit ||1081oMatchEnd > oend_w ||1082(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))1083return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);10841085/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */1086assert(op <= oLitEnd /* No overflow */);1087assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);1088assert(oMatchEnd <= oend /* No underflow */);1089assert(iLitEnd <= litLimit /* Literal length is in bounds */);1090assert(oLitEnd <= oend_w /* Can wildcopy literals */);1091assert(oMatchEnd <= oend_w /* Can wildcopy matches */);10921093/* Copy Literals:1094* Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.1095* We likely don't need the full 32-byte wildcopy.1096*/1097assert(WILDCOPY_OVERLENGTH >= 16);1098ZSTD_copy16(op, (*litPtr));1099if (UNLIKELY(sequence.litLength > 16)) {1100ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);1101}1102op = oLitEnd;1103*litPtr = iLitEnd; /* update for next sequence */11041105/* Copy Match */1106if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {1107/* offset beyond prefix -> go into extDict */1108RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");1109match = dictEnd + (match - prefixStart);1110if (match + sequence.matchLength <= dictEnd) {1111ZSTD_memmove(oLitEnd, match, sequence.matchLength);1112return sequenceLength;1113}1114/* span extDict & currentPrefixSegment */1115{ size_t const length1 = dictEnd - match;1116ZSTD_memmove(oLitEnd, match, length1);1117op = oLitEnd + length1;1118sequence.matchLength -= length1;1119match = prefixStart;1120} }1121/* Match within prefix of 1 or more bytes */1122assert(op <= oMatchEnd);1123assert(oMatchEnd <= oend_w);1124assert(match >= prefixStart);1125assert(sequence.matchLength >= 1);11261127/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy1128* without overlap checking.1129*/1130if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {1131/* We bet on a full wildcopy for matches, since we expect matches to be1132* longer than literals (in general). In silesia, ~10% of matches are longer1133* than 16 bytes.1134*/1135ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);1136return sequenceLength;1137}1138assert(sequence.offset < WILDCOPY_VECLEN);11391140/* Copy 8 bytes and spread the offset to be >= 8. */1141ZSTD_overlapCopy8(&op, &match, sequence.offset);11421143/* If the match length is > 8 bytes, then continue with the wildcopy. */1144if (sequence.matchLength > 8) {1145assert(op < oMatchEnd);1146ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);1147}1148return sequenceLength;1149}115011511152static void1153ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)1154{1155const void* ptr = dt;1156const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;1157DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);1158DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",1159(U32)DStatePtr->state, DTableH->tableLog);1160BIT_reloadDStream(bitD);1161DStatePtr->table = dt + 1;1162}11631164FORCE_INLINE_TEMPLATE void1165ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)1166{1167size_t const lowBits = BIT_readBits(bitD, nbBits);1168DStatePtr->state = nextState + lowBits;1169}11701171/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum1172* offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_321173* bits before reloading. This value is the maximum number of bytes we read1174* after reloading when we are decoding long offsets.1175*/1176#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \1177(ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \1178? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \1179: 0)11801181typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;11821183FORCE_INLINE_TEMPLATE seq_t1184ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)1185{1186seq_t seq;1187/*1188* ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be1189* loaded in one operation and extracted its fields by simply shifting or1190* bit-extracting on aarch64.1191* GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh1192* operations that cause performance drop. This can be avoided by using this1193* ZSTD_memcpy hack.1194*/1195#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))1196ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;1197ZSTD_seqSymbol* const llDInfo = &llDInfoS;1198ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;1199ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;1200ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));1201ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));1202ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));1203#else1204const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;1205const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;1206const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;1207#endif1208seq.matchLength = mlDInfo->baseValue;1209seq.litLength = llDInfo->baseValue;1210{ U32 const ofBase = ofDInfo->baseValue;1211BYTE const llBits = llDInfo->nbAdditionalBits;1212BYTE const mlBits = mlDInfo->nbAdditionalBits;1213BYTE const ofBits = ofDInfo->nbAdditionalBits;1214BYTE const totalBits = llBits+mlBits+ofBits;12151216U16 const llNext = llDInfo->nextState;1217U16 const mlNext = mlDInfo->nextState;1218U16 const ofNext = ofDInfo->nextState;1219U32 const llnbBits = llDInfo->nbBits;1220U32 const mlnbBits = mlDInfo->nbBits;1221U32 const ofnbBits = ofDInfo->nbBits;12221223assert(llBits <= MaxLLBits);1224assert(mlBits <= MaxMLBits);1225assert(ofBits <= MaxOff);1226/*1227* As gcc has better branch and block analyzers, sometimes it is only1228* valuable to mark likeliness for clang, it gives around 3-4% of1229* performance.1230*/12311232/* sequence */1233{ size_t offset;1234if (ofBits > 1) {1235ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);1236ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);1237ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);1238ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);1239if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {1240/* Always read extra bits, this keeps the logic simple,1241* avoids branches, and avoids accidentally reading 0 bits.1242*/1243U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;1244offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);1245BIT_reloadDStream(&seqState->DStream);1246offset += BIT_readBitsFast(&seqState->DStream, extraBits);1247} else {1248offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */1249if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);1250}1251seqState->prevOffset[2] = seqState->prevOffset[1];1252seqState->prevOffset[1] = seqState->prevOffset[0];1253seqState->prevOffset[0] = offset;1254} else {1255U32 const ll0 = (llDInfo->baseValue == 0);1256if (LIKELY((ofBits == 0))) {1257offset = seqState->prevOffset[ll0];1258seqState->prevOffset[1] = seqState->prevOffset[!ll0];1259seqState->prevOffset[0] = offset;1260} else {1261offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);1262{ size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];1263temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */1264if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];1265seqState->prevOffset[1] = seqState->prevOffset[0];1266seqState->prevOffset[0] = offset = temp;1267} } }1268seq.offset = offset;1269}12701271if (mlBits > 0)1272seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);12731274if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))1275BIT_reloadDStream(&seqState->DStream);1276if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))1277BIT_reloadDStream(&seqState->DStream);1278/* Ensure there are enough bits to read the rest of data in 64-bit mode. */1279ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);12801281if (llBits > 0)1282seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);12831284if (MEM_32bits())1285BIT_reloadDStream(&seqState->DStream);12861287DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",1288(U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);12891290ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */1291ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */1292if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */1293ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */1294}12951296return seq;1297}12981299#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION1300MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)1301{1302size_t const windowSize = dctx->fParams.windowSize;1303/* No dictionary used. */1304if (dctx->dictContentEndForFuzzing == NULL) return 0;1305/* Dictionary is our prefix. */1306if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;1307/* Dictionary is not our ext-dict. */1308if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;1309/* Dictionary is not within our window size. */1310if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;1311/* Dictionary is active. */1312return 1;1313}13141315MEM_STATIC void ZSTD_assertValidSequence(1316ZSTD_DCtx const* dctx,1317BYTE const* op, BYTE const* oend,1318seq_t const seq,1319BYTE const* prefixStart, BYTE const* virtualStart)1320{1321#if DEBUGLEVEL >= 11322size_t const windowSize = dctx->fParams.windowSize;1323size_t const sequenceSize = seq.litLength + seq.matchLength;1324BYTE const* const oLitEnd = op + seq.litLength;1325DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",1326(U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);1327assert(op <= oend);1328assert((size_t)(oend - op) >= sequenceSize);1329assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);1330if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {1331size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);1332/* Offset must be within the dictionary. */1333assert(seq.offset <= (size_t)(oLitEnd - virtualStart));1334assert(seq.offset <= windowSize + dictSize);1335} else {1336/* Offset must be within our window. */1337assert(seq.offset <= windowSize);1338}1339#else1340(void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;1341#endif1342}1343#endif13441345#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG134613471348FORCE_INLINE_TEMPLATE size_t1349DONT_VECTORIZE1350ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,1351void* dst, size_t maxDstSize,1352const void* seqStart, size_t seqSize, int nbSeq,1353const ZSTD_longOffset_e isLongOffset,1354const int frame)1355{1356const BYTE* ip = (const BYTE*)seqStart;1357const BYTE* const iend = ip + seqSize;1358BYTE* const ostart = (BYTE*)dst;1359BYTE* const oend = ostart + maxDstSize;1360BYTE* op = ostart;1361const BYTE* litPtr = dctx->litPtr;1362const BYTE* litBufferEnd = dctx->litBufferEnd;1363const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);1364const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);1365const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);1366DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");1367(void)frame;13681369/* Regen sequences */1370if (nbSeq) {1371seqState_t seqState;1372dctx->fseEntropy = 1;1373{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }1374RETURN_ERROR_IF(1375ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),1376corruption_detected, "");1377ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);1378ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);1379ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);1380assert(dst != NULL);13811382ZSTD_STATIC_ASSERT(1383BIT_DStream_unfinished < BIT_DStream_completed &&1384BIT_DStream_endOfBuffer < BIT_DStream_completed &&1385BIT_DStream_completed < BIT_DStream_overflow);13861387/* decompress without overrunning litPtr begins */1388{1389seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1390/* Align the decompression loop to 32 + 16 bytes.1391*1392* zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression1393* speed swings based on the alignment of the decompression loop. This1394* performance swing is caused by parts of the decompression loop falling1395* out of the DSB. The entire decompression loop should fit in the DSB,1396* when it can't we get much worse performance. You can measure if you've1397* hit the good case or the bad case with this perf command for some1398* compressed file test.zst:1399*1400* perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \1401* -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst1402*1403* If you see most cycles served out of the MITE you've hit the bad case.1404* If you see most cycles served out of the DSB you've hit the good case.1405* If it is pretty even then you may be in an okay case.1406*1407* This issue has been reproduced on the following CPUs:1408* - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i91409* Use Instruments->Counters to get DSB/MITE cycles.1410* I never got performance swings, but I was able to1411* go from the good case of mostly DSB to half of the1412* cycles served from MITE.1413* - Coffeelake: Intel i9-9900k1414* - Coffeelake: Intel i7-9700k1415*1416* I haven't been able to reproduce the instability or DSB misses on any1417* of the following CPUS:1418* - Haswell1419* - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH1420* - Skylake1421*1422* Alignment is done for each of the three major decompression loops:1423* - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer1424* - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer1425* - ZSTD_decompressSequences_body1426* Alignment choices are made to minimize large swings on bad cases and influence on performance1427* from changes external to this code, rather than to overoptimize on the current commit.1428*1429* If you are seeing performance stability this script can help test.1430* It tests on 4 commits in zstd where I saw performance change.1431*1432* https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f41433*/1434#if defined(__GNUC__) && defined(__x86_64__)1435__asm__(".p2align 6");1436# if __GNUC__ >= 71437/* good for gcc-7, gcc-9, and gcc-11 */1438__asm__("nop");1439__asm__(".p2align 5");1440__asm__("nop");1441__asm__(".p2align 4");1442# if __GNUC__ == 8 || __GNUC__ == 101443/* good for gcc-8 and gcc-10 */1444__asm__("nop");1445__asm__(".p2align 3");1446# endif1447# endif1448#endif14491450/* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */1451for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {1452size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);1453#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1454assert(!ZSTD_isError(oneSeqSize));1455if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);1456#endif1457if (UNLIKELY(ZSTD_isError(oneSeqSize)))1458return oneSeqSize;1459DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);1460op += oneSeqSize;1461if (UNLIKELY(!--nbSeq))1462break;1463BIT_reloadDStream(&(seqState.DStream));1464sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1465}14661467/* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */1468if (nbSeq > 0) {1469const size_t leftoverLit = dctx->litBufferEnd - litPtr;1470if (leftoverLit)1471{1472RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");1473ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);1474sequence.litLength -= leftoverLit;1475op += leftoverLit;1476}1477litPtr = dctx->litExtraBuffer;1478litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;1479dctx->litBufferLocation = ZSTD_not_in_dst;1480{1481size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);1482#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1483assert(!ZSTD_isError(oneSeqSize));1484if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);1485#endif1486if (UNLIKELY(ZSTD_isError(oneSeqSize)))1487return oneSeqSize;1488DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);1489op += oneSeqSize;1490if (--nbSeq)1491BIT_reloadDStream(&(seqState.DStream));1492}1493}1494}14951496if (nbSeq > 0) /* there is remaining lit from extra buffer */1497{14981499#if defined(__GNUC__) && defined(__x86_64__)1500__asm__(".p2align 6");1501__asm__("nop");1502# if __GNUC__ != 71503/* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */1504__asm__(".p2align 4");1505__asm__("nop");1506__asm__(".p2align 3");1507# elif __GNUC__ >= 111508__asm__(".p2align 3");1509# else1510__asm__(".p2align 5");1511__asm__("nop");1512__asm__(".p2align 3");1513# endif1514#endif15151516for (; ; ) {1517seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1518size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);1519#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1520assert(!ZSTD_isError(oneSeqSize));1521if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);1522#endif1523if (UNLIKELY(ZSTD_isError(oneSeqSize)))1524return oneSeqSize;1525DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);1526op += oneSeqSize;1527if (UNLIKELY(!--nbSeq))1528break;1529BIT_reloadDStream(&(seqState.DStream));1530}1531}15321533/* check if reached exact end */1534DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);1535RETURN_ERROR_IF(nbSeq, corruption_detected, "");1536RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");1537/* save reps for next block */1538{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }1539}15401541/* last literal segment */1542if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */1543{1544size_t const lastLLSize = litBufferEnd - litPtr;1545RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");1546if (op != NULL) {1547ZSTD_memmove(op, litPtr, lastLLSize);1548op += lastLLSize;1549}1550litPtr = dctx->litExtraBuffer;1551litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;1552dctx->litBufferLocation = ZSTD_not_in_dst;1553}1554{ size_t const lastLLSize = litBufferEnd - litPtr;1555RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");1556if (op != NULL) {1557ZSTD_memcpy(op, litPtr, lastLLSize);1558op += lastLLSize;1559}1560}15611562return op-ostart;1563}15641565FORCE_INLINE_TEMPLATE size_t1566DONT_VECTORIZE1567ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,1568void* dst, size_t maxDstSize,1569const void* seqStart, size_t seqSize, int nbSeq,1570const ZSTD_longOffset_e isLongOffset,1571const int frame)1572{1573const BYTE* ip = (const BYTE*)seqStart;1574const BYTE* const iend = ip + seqSize;1575BYTE* const ostart = (BYTE*)dst;1576BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;1577BYTE* op = ostart;1578const BYTE* litPtr = dctx->litPtr;1579const BYTE* const litEnd = litPtr + dctx->litSize;1580const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);1581const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);1582const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);1583DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);1584(void)frame;15851586/* Regen sequences */1587if (nbSeq) {1588seqState_t seqState;1589dctx->fseEntropy = 1;1590{ U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }1591RETURN_ERROR_IF(1592ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)),1593corruption_detected, "");1594ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);1595ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);1596ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);1597assert(dst != NULL);15981599ZSTD_STATIC_ASSERT(1600BIT_DStream_unfinished < BIT_DStream_completed &&1601BIT_DStream_endOfBuffer < BIT_DStream_completed &&1602BIT_DStream_completed < BIT_DStream_overflow);16031604#if defined(__GNUC__) && defined(__x86_64__)1605__asm__(".p2align 6");1606__asm__("nop");1607# if __GNUC__ >= 71608__asm__(".p2align 5");1609__asm__("nop");1610__asm__(".p2align 3");1611# else1612__asm__(".p2align 4");1613__asm__("nop");1614__asm__(".p2align 3");1615# endif1616#endif16171618for ( ; ; ) {1619seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1620size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);1621#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1622assert(!ZSTD_isError(oneSeqSize));1623if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);1624#endif1625if (UNLIKELY(ZSTD_isError(oneSeqSize)))1626return oneSeqSize;1627DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);1628op += oneSeqSize;1629if (UNLIKELY(!--nbSeq))1630break;1631BIT_reloadDStream(&(seqState.DStream));1632}16331634/* check if reached exact end */1635DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);1636RETURN_ERROR_IF(nbSeq, corruption_detected, "");1637RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");1638/* save reps for next block */1639{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }1640}16411642/* last literal segment */1643{ size_t const lastLLSize = litEnd - litPtr;1644RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");1645if (op != NULL) {1646ZSTD_memcpy(op, litPtr, lastLLSize);1647op += lastLLSize;1648}1649}16501651return op-ostart;1652}16531654static size_t1655ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,1656void* dst, size_t maxDstSize,1657const void* seqStart, size_t seqSize, int nbSeq,1658const ZSTD_longOffset_e isLongOffset,1659const int frame)1660{1661return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1662}16631664static size_t1665ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,1666void* dst, size_t maxDstSize,1667const void* seqStart, size_t seqSize, int nbSeq,1668const ZSTD_longOffset_e isLongOffset,1669const int frame)1670{1671return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1672}1673#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */16741675#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT16761677FORCE_INLINE_TEMPLATE size_t1678ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,1679const BYTE* const prefixStart, const BYTE* const dictEnd)1680{1681prefetchPos += sequence.litLength;1682{ const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;1683const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.1684* No consequence though : memory address is only used for prefetching, not for dereferencing */1685PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */1686}1687return prefetchPos + sequence.matchLength;1688}16891690/* This decoding function employs prefetching1691* to reduce latency impact of cache misses.1692* It's generally employed when block contains a significant portion of long-distance matches1693* or when coupled with a "cold" dictionary */1694FORCE_INLINE_TEMPLATE size_t1695ZSTD_decompressSequencesLong_body(1696ZSTD_DCtx* dctx,1697void* dst, size_t maxDstSize,1698const void* seqStart, size_t seqSize, int nbSeq,1699const ZSTD_longOffset_e isLongOffset,1700const int frame)1701{1702const BYTE* ip = (const BYTE*)seqStart;1703const BYTE* const iend = ip + seqSize;1704BYTE* const ostart = (BYTE*)dst;1705BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;1706BYTE* op = ostart;1707const BYTE* litPtr = dctx->litPtr;1708const BYTE* litBufferEnd = dctx->litBufferEnd;1709const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);1710const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);1711const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);1712(void)frame;17131714/* Regen sequences */1715if (nbSeq) {1716#define STORED_SEQS 81717#define STORED_SEQS_MASK (STORED_SEQS-1)1718#define ADVANCED_SEQS STORED_SEQS1719seq_t sequences[STORED_SEQS];1720int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);1721seqState_t seqState;1722int seqNb;1723size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */17241725dctx->fseEntropy = 1;1726{ int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }1727assert(dst != NULL);1728assert(iend >= ip);1729RETURN_ERROR_IF(1730ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),1731corruption_detected, "");1732ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);1733ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);1734ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);17351736/* prepare in advance */1737for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {1738seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1739prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);1740sequences[seqNb] = sequence;1741}1742RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");17431744/* decompress without stomping litBuffer */1745for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {1746seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);1747size_t oneSeqSize;17481749if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)1750{1751/* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */1752const size_t leftoverLit = dctx->litBufferEnd - litPtr;1753if (leftoverLit)1754{1755RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");1756ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);1757sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit;1758op += leftoverLit;1759}1760litPtr = dctx->litExtraBuffer;1761litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;1762dctx->litBufferLocation = ZSTD_not_in_dst;1763oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);1764#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1765assert(!ZSTD_isError(oneSeqSize));1766if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);1767#endif1768if (ZSTD_isError(oneSeqSize)) return oneSeqSize;17691770prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);1771sequences[seqNb & STORED_SEQS_MASK] = sequence;1772op += oneSeqSize;1773}1774else1775{1776/* lit buffer is either wholly contained in first or second split, or not split at all*/1777oneSeqSize = dctx->litBufferLocation == ZSTD_split ?1778ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :1779ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);1780#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1781assert(!ZSTD_isError(oneSeqSize));1782if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);1783#endif1784if (ZSTD_isError(oneSeqSize)) return oneSeqSize;17851786prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);1787sequences[seqNb & STORED_SEQS_MASK] = sequence;1788op += oneSeqSize;1789}1790}1791RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");17921793/* finish queue */1794seqNb -= seqAdvance;1795for ( ; seqNb<nbSeq ; seqNb++) {1796seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);1797if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)1798{1799const size_t leftoverLit = dctx->litBufferEnd - litPtr;1800if (leftoverLit)1801{1802RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");1803ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);1804sequence->litLength -= leftoverLit;1805op += leftoverLit;1806}1807litPtr = dctx->litExtraBuffer;1808litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;1809dctx->litBufferLocation = ZSTD_not_in_dst;1810{1811size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);1812#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1813assert(!ZSTD_isError(oneSeqSize));1814if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);1815#endif1816if (ZSTD_isError(oneSeqSize)) return oneSeqSize;1817op += oneSeqSize;1818}1819}1820else1821{1822size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?1823ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :1824ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);1825#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)1826assert(!ZSTD_isError(oneSeqSize));1827if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);1828#endif1829if (ZSTD_isError(oneSeqSize)) return oneSeqSize;1830op += oneSeqSize;1831}1832}18331834/* save reps for next block */1835{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }1836}18371838/* last literal segment */1839if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */1840{1841size_t const lastLLSize = litBufferEnd - litPtr;1842RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");1843if (op != NULL) {1844ZSTD_memmove(op, litPtr, lastLLSize);1845op += lastLLSize;1846}1847litPtr = dctx->litExtraBuffer;1848litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;1849}1850{ size_t const lastLLSize = litBufferEnd - litPtr;1851RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");1852if (op != NULL) {1853ZSTD_memmove(op, litPtr, lastLLSize);1854op += lastLLSize;1855}1856}18571858return op-ostart;1859}18601861static size_t1862ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,1863void* dst, size_t maxDstSize,1864const void* seqStart, size_t seqSize, int nbSeq,1865const ZSTD_longOffset_e isLongOffset,1866const int frame)1867{1868return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1869}1870#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */1871187218731874#if DYNAMIC_BMI218751876#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG1877static BMI2_TARGET_ATTRIBUTE size_t1878DONT_VECTORIZE1879ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,1880void* dst, size_t maxDstSize,1881const void* seqStart, size_t seqSize, int nbSeq,1882const ZSTD_longOffset_e isLongOffset,1883const int frame)1884{1885return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1886}1887static BMI2_TARGET_ATTRIBUTE size_t1888DONT_VECTORIZE1889ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,1890void* dst, size_t maxDstSize,1891const void* seqStart, size_t seqSize, int nbSeq,1892const ZSTD_longOffset_e isLongOffset,1893const int frame)1894{1895return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1896}1897#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */18981899#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT1900static BMI2_TARGET_ATTRIBUTE size_t1901ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,1902void* dst, size_t maxDstSize,1903const void* seqStart, size_t seqSize, int nbSeq,1904const ZSTD_longOffset_e isLongOffset,1905const int frame)1906{1907return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1908}1909#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */19101911#endif /* DYNAMIC_BMI2 */19121913typedef size_t (*ZSTD_decompressSequences_t)(1914ZSTD_DCtx* dctx,1915void* dst, size_t maxDstSize,1916const void* seqStart, size_t seqSize, int nbSeq,1917const ZSTD_longOffset_e isLongOffset,1918const int frame);19191920#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG1921static size_t1922ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,1923const void* seqStart, size_t seqSize, int nbSeq,1924const ZSTD_longOffset_e isLongOffset,1925const int frame)1926{1927DEBUGLOG(5, "ZSTD_decompressSequences");1928#if DYNAMIC_BMI21929if (ZSTD_DCtx_get_bmi2(dctx)) {1930return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1931}1932#endif1933return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1934}1935static size_t1936ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,1937const void* seqStart, size_t seqSize, int nbSeq,1938const ZSTD_longOffset_e isLongOffset,1939const int frame)1940{1941DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");1942#if DYNAMIC_BMI21943if (ZSTD_DCtx_get_bmi2(dctx)) {1944return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1945}1946#endif1947return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1948}1949#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */195019511952#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT1953/* ZSTD_decompressSequencesLong() :1954* decompression function triggered when a minimum share of offsets is considered "long",1955* aka out of cache.1956* note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".1957* This function will try to mitigate main memory latency through the use of prefetching */1958static size_t1959ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,1960void* dst, size_t maxDstSize,1961const void* seqStart, size_t seqSize, int nbSeq,1962const ZSTD_longOffset_e isLongOffset,1963const int frame)1964{1965DEBUGLOG(5, "ZSTD_decompressSequencesLong");1966#if DYNAMIC_BMI21967if (ZSTD_DCtx_get_bmi2(dctx)) {1968return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1969}1970#endif1971return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);1972}1973#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */197419751976/**1977* @returns The total size of the history referenceable by zstd, including1978* both the prefix and the extDict. At @p op any offset larger than this1979* is invalid.1980*/1981static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)1982{1983return (size_t)(op - virtualStart);1984}19851986typedef struct {1987unsigned longOffsetShare;1988unsigned maxNbAdditionalBits;1989} ZSTD_OffsetInfo;19901991/* ZSTD_getOffsetInfo() :1992* condition : offTable must be valid1993* @return : "share" of long offsets (arbitrarily defined as > (1<<23))1994* compared to maximum possible of (1<<OffFSELog),1995* as well as the maximum number additional bits required.1996*/1997static ZSTD_OffsetInfo1998ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)1999{2000ZSTD_OffsetInfo info = {0, 0};2001/* If nbSeq == 0, then the offTable is uninitialized, but we have2002* no sequences, so both values should be 0.2003*/2004if (nbSeq != 0) {2005const void* ptr = offTable;2006U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;2007const ZSTD_seqSymbol* table = offTable + 1;2008U32 const max = 1 << tableLog;2009U32 u;2010DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);20112012assert(max <= (1 << OffFSELog)); /* max not too large */2013for (u=0; u<max; u++) {2014info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);2015if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;2016}20172018assert(tableLog <= OffFSELog);2019info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */2020}20212022return info;2023}20242025/**2026* @returns The maximum offset we can decode in one read of our bitstream, without2027* reloading more bits in the middle of the offset bits read. Any offsets larger2028* than this must use the long offset decoder.2029*/2030static size_t ZSTD_maxShortOffset(void)2031{2032if (MEM_64bits()) {2033/* We can decode any offset without reloading bits.2034* This might change if the max window size grows.2035*/2036ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);2037return (size_t)-1;2038} else {2039/* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.2040* This offBase would require STREAM_ACCUMULATOR_MIN extra bits.2041* Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.2042*/2043size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;2044size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;2045assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);2046return maxOffset;2047}2048}20492050size_t2051ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,2052void* dst, size_t dstCapacity,2053const void* src, size_t srcSize, const int frame, const streaming_operation streaming)2054{ /* blockType == blockCompressed */2055const BYTE* ip = (const BYTE*)src;2056DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);20572058/* Note : the wording of the specification2059* allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.2060* This generally does not happen, as it makes little sense,2061* since an uncompressed block would feature same size and have no decompression cost.2062* Also, note that decoder from reference libzstd before < v1.5.42063* would consider this edge case as an error.2064* As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX2065* for broader compatibility with the deployed ecosystem of zstd decoders */2066RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");20672068/* Decode literals section */2069{ size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);2070DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);2071if (ZSTD_isError(litCSize)) return litCSize;2072ip += litCSize;2073srcSize -= litCSize;2074}20752076/* Build Decoding Tables */2077{2078/* Compute the maximum block size, which must also work when !frame and fParams are unset.2079* Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.2080*/2081size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));2082size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);2083/* isLongOffset must be true if there are long offsets.2084* Offsets are long if they are larger than ZSTD_maxShortOffset().2085* We don't expect that to be the case in 64-bit mode.2086*2087* We check here to see if our history is large enough to allow long offsets.2088* If it isn't, then we can't possible have (valid) long offsets. If the offset2089* is invalid, then it is okay to read it incorrectly.2090*2091* If isLongOffsets is true, then we will later check our decoding table to see2092* if it is even possible to generate long offsets.2093*/2094ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));2095/* These macros control at build-time which decompressor implementation2096* we use. If neither is defined, we do some inspection and dispatch at2097* runtime.2098*/2099#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \2100!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)2101int usePrefetchDecoder = dctx->ddictIsCold;2102#else2103/* Set to 1 to avoid computing offset info if we don't need to.2104* Otherwise this value is ignored.2105*/2106int usePrefetchDecoder = 1;2107#endif2108int nbSeq;2109size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);2110if (ZSTD_isError(seqHSize)) return seqHSize;2111ip += seqHSize;2112srcSize -= seqHSize;21132114RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");2115RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,2116"invalid dst");21172118/* If we could potentially have long offsets, or we might want to use the prefetch decoder,2119* compute information about the share of long offsets, and the maximum nbAdditionalBits.2120* NOTE: could probably use a larger nbSeq limit2121*/2122if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {2123ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);2124if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {2125/* If isLongOffset, but the maximum number of additional bits that we see in our table is small2126* enough, then we know it is impossible to have too long an offset in this block, so we can2127* use the regular offset decoder.2128*/2129isLongOffset = ZSTD_lo_isRegularOffset;2130}2131if (!usePrefetchDecoder) {2132U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */2133usePrefetchDecoder = (info.longOffsetShare >= minShare);2134}2135}21362137dctx->ddictIsCold = 0;21382139#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \2140!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)2141if (usePrefetchDecoder) {2142#else2143(void)usePrefetchDecoder;2144{2145#endif2146#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT2147return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);2148#endif2149}21502151#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG2152/* else */2153if (dctx->litBufferLocation == ZSTD_split)2154return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);2155else2156return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);2157#endif2158}2159}216021612162void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)2163{2164if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */2165dctx->dictEnd = dctx->previousDstEnd;2166dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));2167dctx->prefixStart = dst;2168dctx->previousDstEnd = dst;2169}2170}217121722173size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,2174void* dst, size_t dstCapacity,2175const void* src, size_t srcSize)2176{2177size_t dSize;2178ZSTD_checkContinuity(dctx, dst, dstCapacity);2179dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);2180dctx->previousDstEnd = (char*)dst + dSize;2181return dSize;2182}218321842185/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */2186size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,2187void* dst, size_t dstCapacity,2188const void* src, size_t srcSize)2189{2190return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);2191}219221932194