Path: blob/master/Utilities/cmzstd/lib/decompress/huf_decompress.c
/* ******************************************************************
 * huff0 huffman decoder,
 * part of Finite State Entropy library
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * You can contact the author at :
 * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
****************************************************************** */

/* **************************************************************
*  Dependencies
****************************************************************/
#include "../common/zstd_deps.h"  /* ZSTD_memcpy, ZSTD_memset */
#include "../common/compiler.h"
#include "../common/bitstream.h"  /* BIT_* */
#include "../common/fse.h"        /* to compress headers */
#include "../common/huf.h"
#include "../common/error_private.h"
#include "../common/zstd_internal.h"
#include "../common/bits.h"       /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */

/* **************************************************************
*  Constants
****************************************************************/

#define HUF_DECODER_FAST_TABLELOG 11

/* **************************************************************
*  Macros
****************************************************************/

#ifdef HUF_DISABLE_FAST_DECODE
# define HUF_ENABLE_FAST_DECODE 0
#else
# define HUF_ENABLE_FAST_DECODE 1
#endif

/* These two optional macros force the use one way or another of the two
 * Huffman decompression implementations. You can't force in both directions
 * at the same time.
 */
#if defined(HUF_FORCE_DECOMPRESS_X1) && \
    defined(HUF_FORCE_DECOMPRESS_X2)
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
#endif

/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
 * supported at runtime, so we can add the BMI2 target attribute.
 * When it is disabled, we will still get BMI2 if it is enabled statically.
 */
#if DYNAMIC_BMI2
# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
#else
# define HUF_FAST_BMI2_ATTRS
#endif

#ifdef __cplusplus
# define HUF_EXTERN_C extern "C"
#else
# define HUF_EXTERN_C
#endif
#define HUF_ASM_DECL HUF_EXTERN_C

#if DYNAMIC_BMI2
# define HUF_NEED_BMI2_FUNCTION 1
#else
# define HUF_NEED_BMI2_FUNCTION 0
#endif

/* **************************************************************
*  Error Management
****************************************************************/
#define HUF_isError ERR_isError


/* **************************************************************
*  Byte alignment for workSpace management
****************************************************************/
#define HUF_ALIGN(x, a)         HUF_ALIGN_MASK((x), (a) - 1)
#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))

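/* Worked example (editorial illustration, not part of the upstream source):
 * HUF_ALIGN rounds an offset up to the next multiple of a power-of-2 alignment
 * by adding (a - 1) and masking off the low bits.
 *
 *     HUF_ALIGN(13, 8)  ->  (13 + 7) & ~7  ->  16
 *     HUF_ALIGN(16, 8)  ->  (16 + 7) & ~7  ->  16   (already aligned: unchanged)
 */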

/* **************************************************************
*  BMI2 Variant Wrappers
****************************************************************/
typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
                                              const void *cSrc,
                                              size_t cSrcSize,
                                              const HUF_DTable *DTable);

#if DYNAMIC_BMI2

#define HUF_DGEN(fn) \
 \
    static size_t fn##_default( \
                  void* dst,  size_t dstSize, \
            const void* cSrc, size_t cSrcSize, \
            const HUF_DTable* DTable) \
    { \
        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
    } \
 \
    static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \
                  void* dst,  size_t dstSize, \
            const void* cSrc, size_t cSrcSize, \
            const HUF_DTable* DTable) \
    { \
        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
    } \
 \
    static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
                     size_t cSrcSize, HUF_DTable const* DTable, int flags) \
    { \
        if (flags & HUF_flags_bmi2) { \
            return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
        } \
        return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
    }

#else

#define HUF_DGEN(fn) \
    static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
                     size_t cSrcSize, HUF_DTable const* DTable, int flags) \
    { \
        (void)flags; \
        return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
    }

#endif


/*-***************************/
/*  generic DTableDesc       */
/*-***************************/
typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;

static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
{
    DTableDesc dtd;
    ZSTD_memcpy(&dtd, table, sizeof(dtd));
    return dtd;
}

static size_t HUF_initFastDStream(BYTE const* ip) {
    BYTE const lastByte = ip[7];
    size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
    size_t const value = MEM_readLEST(ip) | 1;
    assert(bitsConsumed <= 8);
    assert(sizeof(size_t) == 8);
    return value << bitsConsumed;
}

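/* Worked example (editorial illustration, not part of the upstream source):
 * the last byte of a HUF bitstream carries a 1-bit end marker above the data
 * bits. With ip[7] == 0x14 (0001 0100b), ZSTD_highbit32 returns 4, so
 * bitsConsumed == 8 - 4 == 4: shifting the container left by 4 discards the
 * three zero padding bits plus the marker, leaving the first data bit at the
 * MSB. The `| 1` plants a sentinel at bit 0 so that ZSTD_countTrailingZeros64()
 * can later report how many bits have been shifted out (consumed).
 */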

/**
 * The input/output arguments to the Huffman fast decoding loop:
 *
 * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
 * op [in/out] - The output pointers, must be updated to reflect what is written.
 * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
 * dt [in] - The decoding table.
 * ilowest [in] - The beginning of the valid range of the input. Decoders may read
 *                down to this pointer. It may be below iend[0].
 * oend [in] - The end of the output stream. op[3] must not cross oend.
 * iend [in] - The end of each input stream. ip[i] may cross iend[i],
 *             as long as it is above ilowest, but that indicates corruption.
 */
typedef struct {
    BYTE const* ip[4];
    BYTE* op[4];
    U64 bits[4];
    void const* dt;
    BYTE const* ilowest;
    BYTE* oend;
    BYTE const* iend[4];
} HUF_DecompressFastArgs;

typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);

/**
 * Initializes args for the fast decoding loop.
 * @returns 1 on success
 *          0 if the fallback implementation should be used.
 *          Or an error code on failure.
 */
static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
{
    void const* dt = DTable + 1;
    U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;

    const BYTE* const istart = (const BYTE*)src;

    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);

    /* The fast decoding loop assumes 64-bit little-endian.
     * This condition is false on x32.
     */
    if (!MEM_isLittleEndian() || MEM_32bits())
        return 0;

    /* Avoid nullptr addition */
    if (dstSize == 0)
        return 0;
    assert(dst != NULL);

    /* strict minimum : jump table + 1 byte per stream */
    if (srcSize < 10)
        return ERROR(corruption_detected);

    /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers.
     * If table log is not correct at this point, fallback to the old decoder.
     * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
     */
    if (dtLog != HUF_DECODER_FAST_TABLELOG)
        return 0;

    /* Read the jump table. */
    {
        size_t const length1 = MEM_readLE16(istart);
        size_t const length2 = MEM_readLE16(istart+2);
        size_t const length3 = MEM_readLE16(istart+4);
        size_t const length4 = srcSize - (length1 + length2 + length3 + 6);
        args->iend[0] = istart + 6;  /* jumpTable */
        args->iend[1] = args->iend[0] + length1;
        args->iend[2] = args->iend[1] + length2;
        args->iend[3] = args->iend[2] + length3;

        /* HUF_initFastDStream() requires this, and this small of an input
         * won't benefit from the ASM loop anyways.
         */
        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
            return 0;
        if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
    }
    /* ip[] contains the position that is currently loaded into bits[]. */
    args->ip[0] = args->iend[1] - sizeof(U64);
    args->ip[1] = args->iend[2] - sizeof(U64);
    args->ip[2] = args->iend[3] - sizeof(U64);
    args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64);

    /* op[] contains the output pointers. */
    args->op[0] = (BYTE*)dst;
    args->op[1] = args->op[0] + (dstSize+3)/4;
    args->op[2] = args->op[1] + (dstSize+3)/4;
    args->op[3] = args->op[2] + (dstSize+3)/4;

    /* No point to call the ASM loop for tiny outputs. */
    if (args->op[3] >= oend)
        return 0;

    /* bits[] is the bit container.
     * It is read from the MSB down to the LSB.
     * It is shifted left as it is read, and zeros are
     * shifted in. After the lowest valid bit a 1 is
     * set, so that CountTrailingZeros(bits[]) can be used
     * to count how many bits we've consumed.
     */
    args->bits[0] = HUF_initFastDStream(args->ip[0]);
    args->bits[1] = HUF_initFastDStream(args->ip[1]);
    args->bits[2] = HUF_initFastDStream(args->ip[2]);
    args->bits[3] = HUF_initFastDStream(args->ip[3]);

    /* The decoders must be sure to never read beyond ilowest.
     * This is lower than iend[0], but allowing decoders to read
     * down to ilowest can allow an extra iteration or two in the
     * fast loop.
     */
    args->ilowest = istart;

    args->oend = oend;
    args->dt = dt;

    return 1;
}
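/* Worked example (editorial illustration, not part of the upstream source):
 * a 4-stream HUF block starts with a 6-byte jump table holding three
 * little-endian 16-bit lengths; the 4th length is implied by the total size.
 * For srcSize == 100 and encoded lengths 20, 25 and 30:
 *
 *     length4 = 100 - (20 + 25 + 30 + 6) = 19
 *     iend[0] = istart + 6,  iend[1] = iend[0] + 20,
 *     iend[2] = iend[1] + 25, iend[3] = iend[2] + 30
 */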

static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
{
    /* Validate that we haven't overwritten. */
    if (args->op[stream] > segmentEnd)
        return ERROR(corruption_detected);
    /* Validate that we haven't read beyond iend[].
     * Note that ip[] may be < iend[] because the MSB is
     * the next bit to read, and we may have consumed 100%
     * of the stream, so down to iend[i] - 8 is valid.
     */
    if (args->ip[stream] < args->iend[stream] - 8)
        return ERROR(corruption_detected);

    /* Construct the BIT_DStream_t. */
    assert(sizeof(size_t) == 8);
    bit->bitContainer = MEM_readLEST(args->ip[stream]);
    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
    bit->start = (const char*)args->ilowest;
    bit->limitPtr = bit->start + sizeof(size_t);
    bit->ptr = (const char*)args->ip[stream];

    return 0;
}

/* Calls X(N) for each stream 0, 1, 2, 3. */
#define HUF_4X_FOR_EACH_STREAM(X) \
    do { \
        X(0); \
        X(1); \
        X(2); \
        X(3); \
    } while (0)

/* Calls X(N, var) for each stream 0, 1, 2, 3. */
#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
    do { \
        X(0, (var)); \
        X(1, (var)); \
        X(2, (var)); \
        X(3, (var)); \
    } while (0)


#ifndef HUF_FORCE_DECOMPRESS_X2

/*-***************************/
/*  single-symbol decoding   */
/*-***************************/
typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decoding */

/**
 * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
 * a time.
 */
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
    U64 D4;
    if (MEM_isLittleEndian()) {
        D4 = (U64)((symbol << 8) + nbBits);
    } else {
        D4 = (U64)(symbol + (nbBits << 8));
    }
    assert(D4 < (1U << 16));
    D4 *= 0x0001000100010001ULL;
    return D4;
}
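/* Worked example (editorial illustration, not part of the upstream source):
 * on a little-endian target a HUF_DEltX1 {nbBits, byte} viewed as a U16 is
 * (byte << 8) | nbBits. Multiplying that 16-bit value by 0x0001000100010001
 * replicates it into all four 16-bit lanes of a U64, so a single MEM_write64()
 * stores 4 identical table entries, e.g.:
 *
 *     HUF_DEltX1_set4(0x41, 3) == 0x4103410341034103
 */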

/**
 * Increases the tableLog to targetTableLog and rescales the stats.
 * If tableLog > targetTableLog this is a no-op.
 * @returns New tableLog
 */
static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog)
{
    if (tableLog > targetTableLog)
        return tableLog;
    if (tableLog < targetTableLog) {
        U32 const scale = targetTableLog - tableLog;
        U32 s;
        /* Increase the weight for all non-zero probability symbols by scale. */
        for (s = 0; s < nbSymbols; ++s) {
            huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale);
        }
        /* Update rankVal to reflect the new weights.
         * All weights except 0 get moved to weight + scale.
         * Weights [1, scale] are empty.
         */
        for (s = targetTableLog; s > scale; --s) {
            rankVal[s] = rankVal[s - scale];
        }
        for (s = scale; s > 0; --s) {
            rankVal[s] = 0;
        }
    }
    return targetTableLog;
}

typedef struct {
        U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
        U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
        U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
        BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
        BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
} HUF_ReadDTableX1_Workspace;

size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
{
    U32 tableLog = 0;
    U32 nbSymbols = 0;
    size_t iSize;
    void* const dtPtr = DTable + 1;
    HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
    HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;

    DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
    if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);

    DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
    /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */

    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
    if (HUF_isError(iSize)) return iSize;


    /* Table header */
    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
        U32 const maxTableLog = dtd.maxTableLog + 1;
        U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG);
        tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);
        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
        dtd.tableType = 0;
        dtd.tableLog = (BYTE)tableLog;
        ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
    }

    /* Compute symbols and rankStart given rankVal:
     *
     * rankVal already contains the number of values of each weight.
     *
     * symbols contains the symbols ordered by weight. First are the rankVal[0]
     * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
     * symbols[0] is filled (but unused) to avoid a branch.
     *
     * rankStart contains the offset where each rank belongs in the DTable.
     * rankStart[0] is not filled because there are no entries in the table for
     * weight 0.
     */
    {   int n;
        U32 nextRankStart = 0;
        int const unroll = 4;
        int const nLimit = (int)nbSymbols - unroll + 1;
        for (n=0; n<(int)tableLog+1; n++) {
            U32 const curr = nextRankStart;
            nextRankStart += wksp->rankVal[n];
            wksp->rankStart[n] = curr;
        }
        for (n=0; n < nLimit; n += unroll) {
            int u;
            for (u=0; u < unroll; ++u) {
                size_t const w = wksp->huffWeight[n+u];
                wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u);
            }
        }
        for (; n < (int)nbSymbols; ++n) {
            size_t const w = wksp->huffWeight[n];
            wksp->symbols[wksp->rankStart[w]++] = (BYTE)n;
        }
    }

    /* fill DTable
     * We fill all entries of each weight in order.
     * That way length is a constant for each iteration of the outer loop.
     * We can switch based on the length to a different inner loop which is
     * optimized for that particular case.
     */
    {   U32 w;
        int symbol = wksp->rankVal[0];
        int rankStart = 0;
        for (w=1; w<tableLog+1; ++w) {
            int const symbolCount = wksp->rankVal[w];
            int const length = (1 << w) >> 1;
            int uStart = rankStart;
            BYTE const nbBits = (BYTE)(tableLog + 1 - w);
            int s;
            int u;
            switch (length) {
            case 1:
                for (s=0; s<symbolCount; ++s) {
                    HUF_DEltX1 D;
                    D.byte = wksp->symbols[symbol + s];
                    D.nbBits = nbBits;
                    dt[uStart] = D;
                    uStart += 1;
                }
                break;
            case 2:
                for (s=0; s<symbolCount; ++s) {
                    HUF_DEltX1 D;
                    D.byte = wksp->symbols[symbol + s];
                    D.nbBits = nbBits;
                    dt[uStart+0] = D;
                    dt[uStart+1] = D;
                    uStart += 2;
                }
                break;
            case 4:
                for (s=0; s<symbolCount; ++s) {
                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
                    MEM_write64(dt + uStart, D4);
                    uStart += 4;
                }
                break;
            case 8:
                for (s=0; s<symbolCount; ++s) {
                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
                    MEM_write64(dt + uStart, D4);
                    MEM_write64(dt + uStart + 4, D4);
                    uStart += 8;
                }
                break;
            default:
                for (s=0; s<symbolCount; ++s) {
                    U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
                    for (u=0; u < length; u += 16) {
                        MEM_write64(dt + uStart + u + 0, D4);
                        MEM_write64(dt + uStart + u + 4, D4);
                        MEM_write64(dt + uStart + u + 8, D4);
                        MEM_write64(dt + uStart + u + 12, D4);
                    }
                    assert(u == length);
                    uStart += length;
                }
                break;
            }
            symbol += symbolCount;
            rankStart += symbolCount * length;
        }
    }
    return iSize;
}

FORCE_INLINE_TEMPLATE BYTE
HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
{
    size_t const val = BIT_lookBitsFast(Dstream, dtLog);   /* note : dtLog >= 1 */
    BYTE const c = dt[val].byte;
    BIT_skipBits(Dstream, dt[val].nbBits);
    return c;
}

#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)

#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
    do { \
        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
    } while (0)

#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
    do { \
        if (MEM_64bits()) \
            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
    } while (0)

HINT_INLINE size_t
HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
{
    BYTE* const pStart = p;

    /* up to 4 symbols at a time */
    if ((pEnd - p) > 3) {
        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
            HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
            HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
        }
    } else {
        BIT_reloadDStream(bitDPtr);
    }

    /* [0-3] symbols remaining */
    if (MEM_32bits())
        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
            HUF_DECODE_SYMBOLX1_0(p, bitDPtr);

    /* no more data to retrieve from bitstream, no need to reload */
    while (p < pEnd)
        HUF_DECODE_SYMBOLX1_0(p, bitDPtr);

    return (size_t)(pEnd-pStart);
}
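/* Editorial note: the _2/_1/_0 interleave above matches the refill guarantee of
 * BIT_reloadDStream(). On 64-bit targets a reload leaves at least 57 valid bits
 * in the container, so 4 symbols of at most HUF_TABLELOG_MAX (12) bits fit
 * between reloads; on 32-bit targets only the _1 and _0 steps decode, keeping
 * the budget within the smaller container.
 * (Illustrative reasoning, not text from the upstream file.)
 */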

FORCE_INLINE_TEMPLATE size_t
HUF_decompress1X1_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
{
    BYTE* op = (BYTE*)dst;
    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
    const void* dtPtr = DTable + 1;
    const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
    BIT_DStream_t bitD;
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
    U32 const dtLog = dtd.tableLog;

    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );

    HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);

    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);

    return dstSize;
}

/* HUF_decompress4X1_usingDTable_internal_body():
 * Conditions :
 * @dstSize >= 6
 */
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X1_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
{
    /* Check */
    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
    if (dstSize < 6) return ERROR(corruption_detected);   /* stream 4-split doesn't work */

    {   const BYTE* const istart = (const BYTE*) cSrc;
        BYTE* const ostart = (BYTE*) dst;
        BYTE* const oend = ostart + dstSize;
        BYTE* const olimit = oend - 3;
        const void* const dtPtr = DTable + 1;
        const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;

        /* Init */
        BIT_DStream_t bitD1;
        BIT_DStream_t bitD2;
        BIT_DStream_t bitD3;
        BIT_DStream_t bitD4;
        size_t const length1 = MEM_readLE16(istart);
        size_t const length2 = MEM_readLE16(istart+2);
        size_t const length3 = MEM_readLE16(istart+4);
        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
        const BYTE* const istart1 = istart + 6;  /* jumpTable */
        const BYTE* const istart2 = istart1 + length1;
        const BYTE* const istart3 = istart2 + length2;
        const BYTE* const istart4 = istart3 + length3;
        const size_t segmentSize = (dstSize+3) / 4;
        BYTE* const opStart2 = ostart + segmentSize;
        BYTE* const opStart3 = opStart2 + segmentSize;
        BYTE* const opStart4 = opStart3 + segmentSize;
        BYTE* op1 = ostart;
        BYTE* op2 = opStart2;
        BYTE* op3 = opStart3;
        BYTE* op4 = opStart4;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        U32 const dtLog = dtd.tableLog;
        U32 endSignal = 1;

        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
        assert(dstSize >= 6); /* validated above */
        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );

        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
        if ((size_t)(oend - op4) >= sizeof(size_t)) {
            for ( ; (endSignal) & (op4 < olimit) ; ) {
                HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
                HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
                HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
                HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
                HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
                HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
                HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
                HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
            }
        }

        /* check corruption */
        /* note : should not be necessary : op# advance in lock step, and we control op4.
         *        but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */
        if (op1 > opStart2) return ERROR(corruption_detected);
        if (op2 > opStart3) return ERROR(corruption_detected);
        if (op3 > opStart4) return ERROR(corruption_detected);
        /* note : op4 supposed already verified within main loop */

        /* finish bitStreams one by one */
        HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
        HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
        HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
        HUF_decodeStreamX1(op4, &bitD4, oend,     dt, dtLog);

        /* check */
        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
          if (!endCheck) return ERROR(corruption_detected); }

        /* decoded size */
        return dstSize;
    }
}

#if HUF_NEED_BMI2_FUNCTION
static BMI2_TARGET_ATTRIBUTE
size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
#endif

static
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#endif

static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
    U64 bits[4];
    BYTE const* ip[4];
    BYTE* op[4];
    U16 const* const dtable = (U16 const*)args->dt;
    BYTE* const oend = args->oend;
    BYTE const* const ilowest = args->ilowest;

    /* Copy the arguments to local variables */
    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
    ZSTD_memcpy(&op, &args->op, sizeof(op));

    assert(MEM_isLittleEndian());
    assert(!MEM_32bits());

    for (;;) {
        BYTE* olimit;
        int stream;

        /* Assert loop preconditions */
#ifndef NDEBUG
        for (stream = 0; stream < 4; ++stream) {
            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
            assert(ip[stream] >= ilowest);
        }
#endif
        /* Compute olimit */
        {
            /* Each iteration produces 5 output symbols per stream */
            size_t const oiters = (size_t)(oend - op[3]) / 5;
            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
             * per stream.
             */
            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
            /* We can safely run iters iterations before running bounds checks */
            size_t const iters = MIN(oiters, iiters);
            size_t const symbols = iters * 5;

            /* We can simply check that op[3] < olimit, instead of checking all
             * of our bounds, since we can't hit the other bounds until we've run
             * iters iterations, which only happens when op[3] == olimit.
             */
            olimit = op[3] + symbols;

            /* Exit fast decoding loop once we reach the end. */
            if (op[3] == olimit)
                break;

            /* Exit the decoding loop if any input pointer has crossed the
             * previous one. This indicates corruption, and a precondition
             * to our loop is that ip[i] >= ip[0].
             */
            for (stream = 1; stream < 4; ++stream) {
                if (ip[stream] < ip[stream - 1])
                    goto _out;
            }
        }

#ifndef NDEBUG
        for (stream = 1; stream < 4; ++stream) {
            assert(ip[stream] >= ip[stream - 1]);
        }
#endif

#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
    do { \
        int const index = (int)(bits[(_stream)] >> 53); \
        int const entry = (int)dtable[index]; \
        bits[(_stream)] <<= (entry & 0x3F); \
        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
    } while (0)

#define HUF_4X1_RELOAD_STREAM(_stream) \
    do { \
        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
        int const nbBits = ctz & 7; \
        int const nbBytes = ctz >> 3; \
        op[(_stream)] += 5; \
        ip[(_stream)] -= nbBytes; \
        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
        bits[(_stream)] <<= nbBits; \
    } while (0)

        /* Manually unroll the loop because compilers don't consistently
         * unroll the inner loops, which destroys performance.
         */
        do {
            /* Decode 5 symbols in each of the 4 streams */
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);

            /* Reload each of the 4 bitstreams */
            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
        } while (op[3] < olimit);

#undef HUF_4X1_DECODE_SYMBOL
#undef HUF_4X1_RELOAD_STREAM
    }

_out:

    /* Save the final values of each of the state variables back to args. */
    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
    ZSTD_memcpy(&args->op, &op, sizeof(op));
}
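/* Editorial note: the fast loop above treats the X1 table as raw U16 entries.
 * On little-endian, a HUF_DEltX1 {nbBits, byte} reads back as
 * (byte << 8) | nbBits, which is why `entry & 0x3F` recovers the bit count and
 * `(entry >> 8) & 0xFF` the decoded byte; e.g. entry 0x4103 emits 0x41 ('A')
 * and shifts 3 bits out of the 11-bit lookup window (bits[] >> 53).
 * (Illustrative description, not text from the upstream file.)
 */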

/**
 * @returns @p dstSize on success (>= 6)
 *          0 if the fallback implementation should be used
 *          An error if an error occurred
 */
static HUF_FAST_BMI2_ATTRS
size_t
HUF_decompress4X1_usingDTable_internal_fast(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable,
    HUF_DecompressFastLoopFn loopFn)
{
    void const* dt = DTable + 1;
    BYTE const* const ilowest = (BYTE const*)cSrc;
    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
    HUF_DecompressFastArgs args;
    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
        if (ret == 0)
            return 0;
    }

    assert(args.ip[0] >= args.ilowest);
    loopFn(&args);

    /* Our loop guarantees that ip[] >= ilowest and that we haven't
     * overwritten any op[].
     */
    assert(args.ip[0] >= ilowest);
    assert(args.ip[0] >= ilowest);
    assert(args.ip[1] >= ilowest);
    assert(args.ip[2] >= ilowest);
    assert(args.ip[3] >= ilowest);
    assert(args.op[3] <= oend);

    assert(ilowest == args.ilowest);
    assert(ilowest + 6 == args.iend[0]);
    (void)ilowest;

    /* finish bit streams one by one. */
    {   size_t const segmentSize = (dstSize+3) / 4;
        BYTE* segmentEnd = (BYTE*)dst;
        int i;
        for (i = 0; i < 4; ++i) {
            BIT_DStream_t bit;
            if (segmentSize <= (size_t)(oend - segmentEnd))
                segmentEnd += segmentSize;
            else
                segmentEnd = oend;
            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
            /* Decompress and validate that we've produced exactly the expected length. */
            args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG);
            if (args.op[i] != segmentEnd) return ERROR(corruption_detected);
        }
    }

    /* decoded size */
    assert(dstSize != 0);
    return dstSize;
}

HUF_DGEN(HUF_decompress1X1_usingDTable_internal)

static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
{
    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;

#if DYNAMIC_BMI2
    if (flags & HUF_flags_bmi2) {
        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
        if (!(flags & HUF_flags_disableAsm)) {
            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
        }
# endif
    } else {
        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
    }
#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
    if (!(flags & HUF_flags_disableAsm)) {
        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
    }
#endif

    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
        if (ret != 0)
            return ret;
    }
    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}

static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
                                   void* workSpace, size_t wkspSize, int flags)
{
    const BYTE* ip = (const BYTE*) cSrc;

    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
    if (HUF_isError(hSize)) return hSize;
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}

#endif /* HUF_FORCE_DECOMPRESS_X2 */


#ifndef HUF_FORCE_DECOMPRESS_X1

/* *************************/
/* double-symbols decoding */
/* *************************/

typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2;  /* double-symbols decoding */
typedef struct { BYTE symbol; } sortedSymbol_t;
typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];

/**
 * Constructs a HUF_DEltX2 in a U32.
 */
static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level)
{
    U32 seq;
    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0);
    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2);
    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3);
    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32));
    if (MEM_isLittleEndian()) {
        seq = level == 1 ? symbol : (baseSeq + (symbol << 8));
        return seq + (nbBits << 16) + ((U32)level << 24);
    } else {
        seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol);
        return (seq << 16) + (nbBits << 8) + (U32)level;
    }
}

/**
 * Constructs a HUF_DEltX2.
 */
static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)
{
    HUF_DEltX2 DElt;
    U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
    DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val));
    ZSTD_memcpy(&DElt, &val, sizeof(val));
    return DElt;
}

/**
 * Constructs 2 HUF_DEltX2s and packs them into a U64.
 */
static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level)
{
    U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level);
    return (U64)DElt + ((U64)DElt << 32);
}
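/* Editorial note: an X2 entry packs up to two decoded bytes. On little-endian,
 * a level-2 entry built from first symbol 'A' (baseSeq) and second symbol 'B'
 * has sequence == 'A' + ('B' << 8), so the 2-byte ZSTD_memcpy in the decoder
 * writes "AB" in output order; `length` (1 or 2) is how far the output pointer
 * then advances, and `nbBits` is the combined code length consumed by the pair.
 * (Illustrative description, not text from the upstream file.)
 */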

/**
 * Fills the DTable rank with all the symbols from [begin, end) that are each
 * nbBits long.
 *
 * @param DTableRank The start of the rank in the DTable.
 * @param begin The first symbol to fill (inclusive).
 * @param end The last symbol to fill (exclusive).
 * @param nbBits Each symbol is nbBits long.
 * @param tableLog The table log.
 * @param baseSeq If level == 1 { 0 } else { the first level symbol }
 * @param level The level in the table. Must be 1 or 2.
 */
static void HUF_fillDTableX2ForWeight(
    HUF_DEltX2* DTableRank,
    sortedSymbol_t const* begin, sortedSymbol_t const* end,
    U32 nbBits, U32 tableLog,
    U16 baseSeq, int const level)
{
    U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */);
    const sortedSymbol_t* ptr;
    assert(level >= 1 && level <= 2);
    switch (length) {
    case 1:
        for (ptr = begin; ptr != end; ++ptr) {
            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
            *DTableRank++ = DElt;
        }
        break;
    case 2:
        for (ptr = begin; ptr != end; ++ptr) {
            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level);
            DTableRank[0] = DElt;
            DTableRank[1] = DElt;
            DTableRank += 2;
        }
        break;
    case 4:
        for (ptr = begin; ptr != end; ++ptr) {
            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
            DTableRank += 4;
        }
        break;
    case 8:
        for (ptr = begin; ptr != end; ++ptr) {
            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
            ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
            ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
            DTableRank += 8;
        }
        break;
    default:
        for (ptr = begin; ptr != end; ++ptr) {
            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level);
            HUF_DEltX2* const DTableRankEnd = DTableRank + length;
            for (; DTableRank != DTableRankEnd; DTableRank += 8) {
                ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2));
                ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2));
                ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2));
                ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2));
            }
        }
        break;
    }
}

/* HUF_fillDTableX2Level2() :
 * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits,
                           const U32* rankVal, const int minWeight, const int maxWeight1,
                           const sortedSymbol_t* sortedSymbols, U32 const* rankStart,
                           U32 nbBitsBaseline, U16 baseSeq)
{
    /* Fill skipped values (all positions up to rankVal[minWeight]).
     * These positions only get a single symbol because the combined weight
     * is too large.
     */
    if (minWeight>1) {
        U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */);
        U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1);
        int const skipSize = rankVal[minWeight];
        assert(length > 1);
        assert((U32)skipSize < length);
        switch (length) {
        case 2:
            assert(skipSize == 1);
            ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2));
            break;
        case 4:
            assert(skipSize <= 4);
            ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2));
            ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2));
            break;
        default:
            {
                int i;
                for (i = 0; i < skipSize; i += 8) {
                    ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2));
                    ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2));
                    ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2));
                    ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2));
                }
            }
        }
    }

    /* Fill each of the second level symbols by weight. */
    {
        int w;
        for (w = minWeight; w < maxWeight1; ++w) {
            int const begin = rankStart[w];
            int const end = rankStart[w+1];
            U32 const nbBits = nbBitsBaseline - w;
            U32 const totalBits = nbBits + consumedBits;
            HUF_fillDTableX2ForWeight(
                DTable + rankVal[w],
                sortedSymbols + begin, sortedSymbols + end,
                totalBits, targetLog,
                baseSeq, /* level */ 2);
        }
    }
}

static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                           const sortedSymbol_t* sortedList,
                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
                           const U32 nbBitsBaseline)
{
    U32* const rankVal = rankValOrigin[0];
    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
    const U32 minBits  = nbBitsBaseline - maxWeight;
    int w;
    int const wEnd = (int)maxWeight + 1;

    /* Fill DTable in order of weight. */
    for (w = 1; w < wEnd; ++w) {
        int const begin = (int)rankStart[w];
        int const end = (int)rankStart[w+1];
        U32 const nbBits = nbBitsBaseline - w;

        if (targetLog-nbBits >= minBits) {
            /* Enough room for a second symbol. */
            int start = rankVal[w];
            U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);
            int minWeight = nbBits + scaleLog;
            int s;
            if (minWeight < 1) minWeight = 1;
            /* Fill the DTable for every symbol of weight w.
             * These symbols get at least 1 second symbol.
             */
            for (s = begin; s != end; ++s) {
                HUF_fillDTableX2Level2(
                    DTable + start, targetLog, nbBits,
                    rankValOrigin[nbBits], minWeight, wEnd,
                    sortedList, rankStart,
                    nbBitsBaseline, sortedList[s].symbol);
                start += length;
            }
        } else {
            /* Only a single symbol. */
            HUF_fillDTableX2ForWeight(
                DTable + rankVal[w],
                sortedList + begin, sortedList + end,
                nbBits, targetLog,
                /* baseSeq */ 0, /* level */ 1);
        }
    }
}

typedef struct {
    rankValCol_t rankVal[HUF_TABLELOG_MAX];
    U32 rankStats[HUF_TABLELOG_MAX + 1];
    U32 rankStart0[HUF_TABLELOG_MAX + 3];
    sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
    BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
    U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
} HUF_ReadDTableX2_Workspace;

size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                       const void* src, size_t srcSize,
                             void* workSpace, size_t wkspSize, int flags)
{
    U32 tableLog, maxW, nbSymbols;
    DTableDesc dtd = HUF_getDTableDesc(DTable);
    U32 maxTableLog = dtd.maxTableLog;
    size_t iSize;
    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
    U32 *rankStart;

    HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace;

    if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC);

    rankStart = wksp->rankStart0 + 1;
    ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats));
    ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0));

    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
    /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */

    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
    if (HUF_isError(iSize)) return iSize;

    /* check result */
    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
    if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;

    /* find maxWeight */
    for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */

    /* Get start index of each weight */
    {   U32 w, nextRankStart = 0;
        for (w=1; w<maxW+1; w++) {
            U32 curr = nextRankStart;
            nextRankStart += wksp->rankStats[w];
            rankStart[w] = curr;
        }
        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/
        rankStart[maxW+1] = nextRankStart;
    }

    /* sort symbols by weight */
    {   U32 s;
        for (s=0; s<nbSymbols; s++) {
            U32 const w = wksp->weightList[s];
            U32 const r = rankStart[w]++;
            wksp->sortedSymbol[r].symbol = (BYTE)s;
        }
        rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
    }

    /* Build rankVal */
    {   U32* const rankVal0 = wksp->rankVal[0];
        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
            U32 nextRankVal = 0;
            U32 w;
            for (w=1; w<maxW+1; w++) {
                U32 curr = nextRankVal;
                nextRankVal += wksp->rankStats[w] << (w+rescale);
                rankVal0[w] = curr;
        }   }
        {   U32 const minBits = tableLog+1 - maxW;
            U32 consumed;
            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
                U32* const rankValPtr = wksp->rankVal[consumed];
                U32 w;
                for (w = 1; w < maxW+1; w++) {
                    rankValPtr[w] = rankVal0[w] >> consumed;
    }   }   }   }

    HUF_fillDTableX2(dt, maxTableLog,
                   wksp->sortedSymbol,
                   wksp->rankStart0, wksp->rankVal, maxW,
                   tableLog+1);

    dtd.tableLog = (BYTE)maxTableLog;
    dtd.tableType = 1;
    ZSTD_memcpy(DTable, &dtd, sizeof(dtd));
    return iSize;
}


FORCE_INLINE_TEMPLATE U32
HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
{
    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
    ZSTD_memcpy(op, &dt[val].sequence, 2);
    BIT_skipBits(DStream, dt[val].nbBits);
    return dt[val].length;
}

FORCE_INLINE_TEMPLATE U32
HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
{
    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
    ZSTD_memcpy(op, &dt[val].sequence, 1);
    if (dt[val].length==1) {
        BIT_skipBits(DStream, dt[val].nbBits);
    } else {
        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
            BIT_skipBits(DStream, dt[val].nbBits);
            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
                /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
        }
    }
    return 1;
}

#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)

#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
    do { \
        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
    } while (0)

#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
    do { \
        if (MEM_64bits()) \
            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
    } while (0)

HINT_INLINE size_t
HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
                const HUF_DEltX2* const dt, const U32 dtLog)
{
    BYTE* const pStart = p;

    /* up to 8 symbols at a time */
    if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) {
        if (dtLog <= 11 && MEM_64bits()) {
            /* up to 10 symbols at a time */
            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) {
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
            }
        } else {
            /* up to 8 symbols at a time */
            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
                HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
            }
        }
    } else {
        BIT_reloadDStream(bitDPtr);
    }

    /* closer to end : up to 2 symbols at a time */
    if ((size_t)(pEnd - p) >= 2) {
        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);

        while (p <= pEnd-2)
            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
    }

    if (p < pEnd)
        p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);

    return p-pStart;
}

FORCE_INLINE_TEMPLATE size_t
HUF_decompress1X2_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
{
    BIT_DStream_t bitD;

    /* Init */
    CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );

    /* decode */
    {   BYTE* const ostart = (BYTE*) dst;
        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);
    }

    /* check */
    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);

    /* decoded size */
    return dstSize;
}

/* HUF_decompress4X2_usingDTable_internal_body():
 * Conditions:
 * @dstSize >= 6
 */
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X2_usingDTable_internal_body(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable)
{
    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
    if (dstSize < 6) return ERROR(corruption_detected);   /* stream 4-split doesn't work */

    {   const BYTE* const istart = (const BYTE*) cSrc;
        BYTE* const ostart = (BYTE*) dst;
        BYTE* const oend = ostart + dstSize;
        BYTE* const olimit = oend - (sizeof(size_t)-1);
        const void* const dtPtr = DTable+1;
        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;

        /* Init */
        BIT_DStream_t bitD1;
        BIT_DStream_t bitD2;
        BIT_DStream_t bitD3;
        BIT_DStream_t bitD4;
        size_t const length1 = MEM_readLE16(istart);
        size_t const length2 = MEM_readLE16(istart+2);
        size_t const length3 = MEM_readLE16(istart+4);
        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
        const BYTE* const istart1 = istart + 6;  /* jumpTable */
        const BYTE* const istart2 = istart1 + length1;
        const BYTE* const istart3 = istart2 + length2;
        const BYTE* const istart4 = istart3 + length3;
        size_t const segmentSize = (dstSize+3) / 4;
        BYTE* const opStart2 = ostart + segmentSize;
        BYTE* const opStart3 = opStart2 + segmentSize;
        BYTE* const opStart4 = opStart3 + segmentSize;
        BYTE* op1 = ostart;
        BYTE* op2 = opStart2;
        BYTE* op3 = opStart3;
        BYTE* op4 = opStart4;
        U32 endSignal = 1;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        U32 const dtLog = dtd.tableLog;

        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
        assert(dstSize >= 6 /* validated above */);
        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
        CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
        CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );

        /* 16-32 symbols per loop (4-8 symbols per stream) */
        if ((size_t)(oend - op4) >= sizeof(size_t)) {
            for ( ; (endSignal) & (op4 < olimit); ) {
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
#else
                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
                HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
                HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
                HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
                HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
                endSignal = (U32)LIKELY((U32)
                            (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
                          & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
                          & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
                          & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
#endif
            }
        }

        /* check corruption */
        if (op1 > opStart2) return ERROR(corruption_detected);
        if (op2 > opStart3) return ERROR(corruption_detected);
        if (op3 > opStart4) return ERROR(corruption_detected);
        /* note : op4 already verified within main loop */

        /* finish bitStreams one by one */
        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);

        /* check */
        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
          if (!endCheck) return ERROR(corruption_detected); }

        /* decoded size */
        return dstSize;
    }
}

#if HUF_NEED_BMI2_FUNCTION
static BMI2_TARGET_ATTRIBUTE
size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
#endif

static
size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable) {
    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#endif

static HUF_FAST_BMI2_ATTRS
void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
{
    U64 bits[4];
    BYTE const* ip[4];
    BYTE* op[4];
    BYTE* oend[4];
    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
    BYTE const* const ilowest = args->ilowest;

    /* Copy the arguments to local registers. */
    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
    ZSTD_memcpy(&op, &args->op, sizeof(op));

    oend[0] = op[1];
    oend[1] = op[2];
    oend[2] = op[3];
    oend[3] = args->oend;

    assert(MEM_isLittleEndian());
    assert(!MEM_32bits());

    for (;;) {
        BYTE* olimit;
        int stream;

        /* Assert loop preconditions */
#ifndef NDEBUG
        for (stream = 0; stream < 4; ++stream) {
            assert(op[stream] <= oend[stream]);
            assert(ip[stream] >= ilowest);
        }
#endif
        /* Compute olimit */
        {
            /* Each loop does 5 table lookups for each of the 4 streams.
             * Each table lookup consumes up to 11 bits of input, and produces
             * up to 2 bytes of output.
             */
            /* We can consume up to 7 bytes of input per iteration per stream.
             * We also know that each input pointer is >= ip[0]. So we can run
             * iters loops before running out of input.
             */
            size_t iters = (size_t)(ip[0] - ilowest) / 7;
            /* Each iteration can produce up to 10 bytes of output per stream.
             * Each output stream may advance at different rates. So take the
             * minimum number of safe iterations among all the output streams.
             */
            for (stream = 0; stream < 4; ++stream) {
                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
                iters = MIN(iters, oiters);
            }

            /* Each iteration produces at least 5 output symbols. So until
             * op[3] crosses olimit, we know we haven't executed iters
             * iterations yet. This saves us maintaining an iters counter,
             * at the expense of computing the remaining # of iterations
             * more frequently.
             */
            olimit = op[3] + (iters * 5);

            /* Exit the fast decoding loop once we reach the end. */
            if (op[3] == olimit)
                break;

            /* Exit the decoding loop if any input pointer has crossed the
             * previous one. This indicates corruption, and a precondition
             * to our loop is that ip[i] >= ip[0].
             */
            for (stream = 1; stream < 4; ++stream) {
                if (ip[stream] < ip[stream - 1])
                    goto _out;
            }
        }

#ifndef NDEBUG
        for (stream = 1; stream < 4; ++stream) {
            assert(ip[stream] >= ip[stream - 1]);
        }
#endif

#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
    do { \
        if ((_decode3) || (_stream) != 3) { \
            int const index = (int)(bits[(_stream)] >> 53); \
            HUF_DEltX2 const entry = dtable[index]; \
            MEM_write16(op[(_stream)], entry.sequence); \
            bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
            op[(_stream)] += (entry.length); \
        } \
    } while (0)

#define HUF_4X2_RELOAD_STREAM(_stream) \
    do { \
        HUF_4X2_DECODE_SYMBOL(3, 1); \
        { \
            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
            int const nbBits = ctz & 7; \
            int const nbBytes = ctz >> 3; \
            ip[(_stream)] -= nbBytes; \
            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
            bits[(_stream)] <<= nbBits; \
        } \
    } while (0)

        /* Manually unroll the loop because compilers don't consistently
         * unroll the inner loops, which destroys performance.
         */
        do {
            /* Decode 5 symbols from each of the first 3 streams.
             * The final stream will be decoded during the reload phase
             * to reduce register pressure.
             */
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);

            /* Decode one symbol from the final stream */
            HUF_4X2_DECODE_SYMBOL(3, 1);

            /* Decode 4 symbols from the final stream & reload bitstreams.
             * The final stream is reloaded last, meaning that all 5 symbols
             * are decoded from the final stream before it is reloaded.
             */
            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
        } while (op[3] < olimit);
    }

#undef HUF_4X2_DECODE_SYMBOL
#undef HUF_4X2_RELOAD_STREAM

_out:

    /* Save the final values of each of the state variables back to args. */
    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
    ZSTD_memcpy(&args->op, &op, sizeof(op));
}


static HUF_FAST_BMI2_ATTRS size_t
HUF_decompress4X2_usingDTable_internal_fast(
          void* dst,  size_t dstSize,
    const void* cSrc, size_t cSrcSize,
    const HUF_DTable* DTable,
    HUF_DecompressFastLoopFn loopFn) {
    void const* dt = DTable + 1;
    const BYTE* const ilowest = (const BYTE*)cSrc;
    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
    HUF_DecompressFastArgs args;
    {
        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
        FORWARD_IF_ERROR(ret, "Failed to init asm args");
        if (ret == 0)
            return 0;
    }

    assert(args.ip[0] >= args.ilowest);
    loopFn(&args);

    /* note : op4 already verified within main loop */
    assert(args.ip[0] >= ilowest);
    assert(args.ip[1] >= ilowest);
    assert(args.ip[2] >= ilowest);
    assert(args.ip[3] >= ilowest);
    assert(args.op[3] <= oend);

    assert(ilowest == args.ilowest);
    assert(ilowest + 6 == args.iend[0]);
    (void)ilowest;

    /* finish bitStreams one by one */
    {
        size_t const segmentSize = (dstSize+3) / 4;
        BYTE* segmentEnd = (BYTE*)dst;
        int i;
        for (i = 0; i < 4; ++i) {
            BIT_DStream_t bit;
            if (segmentSize <= (size_t)(oend - segmentEnd))
                segmentEnd += segmentSize;
            else
                segmentEnd = oend;
            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption");
            args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG);
            if (args.op[i] != segmentEnd)
                return ERROR(corruption_detected);
        }
    }

    /* decoded size */
    return dstSize;
}

static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
{
    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;

#if DYNAMIC_BMI2
    if (flags & HUF_flags_bmi2) {
        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
        if (!(flags & HUF_flags_disableAsm)) {
            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
        }
# endif
    } else {
        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
    }
#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
    if (!(flags & HUF_flags_disableAsm)) {
        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
    }
#endif

    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
        if (ret != 0)
            return ret;
    }
    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}

HUF_DGEN(HUF_decompress1X2_usingDTable_internal)

size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
                                   void* workSpace, size_t wkspSize, int flags)
{
    const BYTE* ip = (const BYTE*) cSrc;

    size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
                                               workSpace, wkspSize, flags);
    if (HUF_isError(hSize)) return hSize;
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;


static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                          const void* cSrc, size_t cSrcSize,
                                          void* workSpace, size_t wkspSize, int flags)
{
    const BYTE* ip = (const BYTE*) cSrc;

    size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
                                         workSpace, wkspSize, flags);
    if (HUF_isError(hSize)) return hSize;
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}

#endif /* HUF_FORCE_DECOMPRESS_X1 */


/* ***********************************/
/* Universal decompression selectors */
/* ***********************************/


#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =
{
    /* single,   double */
    {{   0,  0}, {   1,  1}},  /* Q==0 : impossible */
    {{   0,  0}, {   1,  1}},  /* Q==1 : impossible */
    {{ 150,216}, { 381,119}},  /* Q == 2 : 12-18% */
    {{ 170,205}, { 514,112}},  /* Q == 3 : 18-25% */
    {{ 177,199}, { 539,110}},  /* Q == 4 : 25-32% */
    {{ 197,194}, { 644,107}},  /* Q == 5 : 32-38% */
    {{ 221,192}, { 735,107}},  /* Q == 6 : 38-44% */
    {{ 256,189}, { 881,106}},  /* Q == 7 : 44-50% */
    {{ 359,188}, {1167,109}},  /* Q == 8 : 50-56% */
    {{ 582,187}, {1570,114}},  /* Q == 9 : 56-62% */
    {{ 688,187}, {1712,122}},  /* Q ==10 : 62-69% */
    {{ 825,186}, {1965,136}},  /* Q ==11 : 69-75% */
    {{ 976,185}, {2131,150}},  /* Q ==12 : 75-81% */
    {{1180,186}, {2070,175}},  /* Q ==13 : 81-87% */
    {{1377,185}, {1731,202}},  /* Q ==14 : 87-93% */
    {{1412,185}, {1695,202}},  /* Q ==15 : 93-99% */
};
#endif

/** HUF_selectDecoder() :
 *  Tells which decoder is likely to decode faster,
 *  based on a set of pre-computed metrics.
 * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
 *  Assumption : 0 < dstSize <= 128 KB */
U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
{
    assert(dstSize > 0);
    assert(dstSize <= 128*1024);
#if defined(HUF_FORCE_DECOMPRESS_X1)
    (void)dstSize;
    (void)cSrcSize;
    return 0;
#elif defined(HUF_FORCE_DECOMPRESS_X2)
    (void)dstSize;
    (void)cSrcSize;
    return 1;
#else
    /* decoder timing evaluation */
    {   U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize);  /* Q < 16 */
        U32 const D256 = (U32)(dstSize >> 8);
        U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
        U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
        DTime1 += DTime1 >> 5;  /* small advantage to algorithm using less memory, to reduce cache eviction */
        return DTime1 < DTime0;
    }
#endif
}
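
/* Worked example of the heuristic above (values taken straight from algoTime):
 * dstSize = 64 KB and cSrcSize = 40 KB give Q = 40960*16/65536 = 10 and
 * D256 = 65536>>8 = 256. Then
 *   DTime0 =  688 + 187*256 = 48560                      (X1, single-symbol)
 *   DTime1 = 1712 + 122*256 = 32944, +(32944>>5) = 33973 (X2, double-symbol)
 * Since DTime1 < DTime0 the function returns 1 and the X2 decoder is chosen:
 * at that compression ratio the larger X2 table pays for itself on a 64 KB
 * block, whereas a much smaller block (small D256) would tip the balance back
 * toward X1's cheaper table build. */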

size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                  const void* cSrc, size_t cSrcSize,
                                  void* workSpace, size_t wkspSize, int flags)
{
    /* validation checks */
    if (dstSize == 0) return ERROR(dstSize_tooSmall);
    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
    if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
    if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */

    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
#if defined(HUF_FORCE_DECOMPRESS_X1)
        (void)algoNb;
        assert(algoNb == 0);
        return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                           cSrcSize, workSpace, wkspSize, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
        (void)algoNb;
        assert(algoNb == 1);
        return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                           cSrcSize, workSpace, wkspSize, flags);
#else
        return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                                    cSrcSize, workSpace, wkspSize, flags) :
                        HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
                                                    cSrcSize, workSpace, wkspSize, flags);
#endif
    }
}


size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
{
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
#if defined(HUF_FORCE_DECOMPRESS_X1)
    (void)dtd;
    assert(dtd.tableType == 0);
    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
    (void)dtd;
    assert(dtd.tableType == 1);
    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#else
    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#endif
}
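
/* A note on the `flags` parameter threaded through these entry points: it is a
 * bitset (declared in huf.h). In the dispatch logic above, HUF_flags_bmi2 opts
 * into the BMI2-compiled variants when DYNAMIC_BMI2 is enabled, while
 * HUF_flags_disableAsm and HUF_flags_disableFast steer the 4-stream decoders
 * away from the x86-64 assembly loop and the fast path respectively, as in
 * HUF_decompress4X2_usingDTable_internal(). */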

#ifndef HUF_FORCE_DECOMPRESS_X2
size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
{
    const BYTE* ip = (const BYTE*) cSrc;

    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
    if (HUF_isError(hSize)) return hSize;
    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
    ip += hSize; cSrcSize -= hSize;

    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}
#endif

size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
{
    DTableDesc const dtd = HUF_getDTableDesc(DTable);
#if defined(HUF_FORCE_DECOMPRESS_X1)
    (void)dtd;
    assert(dtd.tableType == 0);
    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
    (void)dtd;
    assert(dtd.tableType == 1);
    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#else
    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#endif
}

size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
{
    /* validation checks */
    if (dstSize == 0) return ERROR(dstSize_tooSmall);
    if (cSrcSize == 0) return ERROR(corruption_detected);

    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
#if defined(HUF_FORCE_DECOMPRESS_X1)
        (void)algoNb;
        assert(algoNb == 0);
        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
        (void)algoNb;
        assert(algoNb == 1);
        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#else
        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#endif
    }
}
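
/* Illustrative usage sketch (kept under `#if 0` so it is never compiled): a
 * minimal caller of HUF_decompress4X_hufOnly_wksp(), assuming the static-DTable
 * and workspace-size helpers declared in huf.h (HUF_CREATE_STATIC_DTABLEX2,
 * HUF_TABLELOG_MAX, HUF_DECOMPRESS_WORKSPACE_SIZE_U32). The function name and
 * its arguments are hypothetical, not part of this library. */
#if 0
static size_t example_decompressHufBlock(void* dst, size_t dstSize,
                                         const void* src, size_t srcSize)
{
    /* dstSize must be the exact regenerated size (<= 128 KB for this entry
     * point). The DTable's first cell carries its capacity (max table log);
     * the huf.h helper below initializes it accordingly. */
    HUF_CREATE_STATIC_DTABLEX2(dtable, HUF_TABLELOG_MAX);
    U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];

    /* Reads the Huffman table header from src, lets HUF_selectDecoder() pick
     * the X1 or X2 decoder, then decodes the 4 interleaved streams. Returns
     * the regenerated size, or an error code (test with HUF_isError()). */
    return HUF_decompress4X_hufOnly_wksp(dtable, dst, dstSize,
                                         src, srcSize,
                                         workspace, sizeof(workspace),
                                         /* flags */ 0);
}
#endif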