Path: blob/main/sys/contrib/zstd/lib/compress/zstd_lazy.c
48378 views
/*1* Copyright (c) Yann Collet, Facebook, Inc.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910#include "zstd_compress_internal.h"11#include "zstd_lazy.h"121314/*-*************************************15* Binary Tree search16***************************************/1718static void19ZSTD_updateDUBT(ZSTD_matchState_t* ms,20const BYTE* ip, const BYTE* iend,21U32 mls)22{23const ZSTD_compressionParameters* const cParams = &ms->cParams;24U32* const hashTable = ms->hashTable;25U32 const hashLog = cParams->hashLog;2627U32* const bt = ms->chainTable;28U32 const btLog = cParams->chainLog - 1;29U32 const btMask = (1 << btLog) - 1;3031const BYTE* const base = ms->window.base;32U32 const target = (U32)(ip - base);33U32 idx = ms->nextToUpdate;3435if (idx != target)36DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",37idx, target, ms->window.dictLimit);38assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */39(void)iend;4041assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */42for ( ; idx < target ; idx++) {43size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */44U32 const matchIndex = hashTable[h];4546U32* const nextCandidatePtr = bt + 2*(idx&btMask);47U32* const sortMarkPtr = nextCandidatePtr + 1;4849DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);50hashTable[h] = idx; /* Update Hash Table */51*nextCandidatePtr = matchIndex; /* update BT like a chain */52*sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;53}54ms->nextToUpdate = target;55}565758/** ZSTD_insertDUBT1() :59* sort one already inserted but unsorted position60* assumption : curr >= btlow == (curr - btmask)61* doesn't fail */62static void63ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,64U32 curr, const BYTE* inputEnd,65U32 nbCompares, U32 btLow,66const ZSTD_dictMode_e dictMode)67{68const ZSTD_compressionParameters* const cParams = &ms->cParams;69U32* const bt = ms->chainTable;70U32 const btLog = cParams->chainLog - 1;71U32 const btMask = (1 << btLog) - 1;72size_t commonLengthSmaller=0, commonLengthLarger=0;73const BYTE* const base = ms->window.base;74const BYTE* const dictBase = ms->window.dictBase;75const U32 dictLimit = ms->window.dictLimit;76const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;77const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;78const BYTE* const dictEnd = dictBase + dictLimit;79const BYTE* const prefixStart = base + dictLimit;80const BYTE* match;81U32* smallerPtr = bt + 2*(curr&btMask);82U32* largerPtr = smallerPtr + 1;83U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */84U32 dummy32; /* to be nullified at the end */85U32 const windowValid = ms->window.lowLimit;86U32 const maxDistance = 1U << cParams->windowLog;87U32 const windowLow = (curr - windowValid > maxDistance) ? 
curr - maxDistance : windowValid;888990DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",91curr, dictLimit, windowLow);92assert(curr >= btLow);93assert(ip < iend); /* condition for ZSTD_count */9495for (; nbCompares && (matchIndex > windowLow); --nbCompares) {96U32* const nextPtr = bt + 2*(matchIndex & btMask);97size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */98assert(matchIndex < curr);99/* note : all candidates are now supposed sorted,100* but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK101* when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */102103if ( (dictMode != ZSTD_extDict)104|| (matchIndex+matchLength >= dictLimit) /* both in current segment*/105|| (curr < dictLimit) /* both in extDict */) {106const BYTE* const mBase = ( (dictMode != ZSTD_extDict)107|| (matchIndex+matchLength >= dictLimit)) ?108base : dictBase;109assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */110|| (curr < dictLimit) );111match = mBase + matchIndex;112matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);113} else {114match = dictBase + matchIndex;115matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);116if (matchIndex+matchLength >= dictLimit)117match = base + matchIndex; /* preparation for next read of match[matchLength] */118}119120DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",121curr, matchIndex, (U32)matchLength);122123if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */124break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */125}126127if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */128/* match is smaller than current */129*smallerPtr = matchIndex; /* update smaller idx */130commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */131if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */132DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",133matchIndex, btLow, nextPtr[1]);134smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */135matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */136} else {137/* match is larger than current */138*largerPtr = matchIndex;139commonLengthLarger = matchLength;140if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */141DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",142matchIndex, btLow, nextPtr[0]);143largerPtr = nextPtr;144matchIndex = nextPtr[0];145} }146147*smallerPtr = *largerPtr = 0;148}149150151static size_t152ZSTD_DUBT_findBetterDictMatch (153const ZSTD_matchState_t* ms,154const BYTE* const ip, const BYTE* const iend,155size_t* offsetPtr,156size_t bestLength,157U32 nbCompares,158U32 const mls,159const ZSTD_dictMode_e dictMode)160{161const ZSTD_matchState_t * const dms = ms->dictMatchState;162const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;163const U32 * const dictHashTable = dms->hashTable;164U32 const hashLog = dmsCParams->hashLog;165size_t const h = ZSTD_hashPtr(ip, hashLog, mls);166U32 dictMatchIndex = dictHashTable[h];167168const BYTE* const base = ms->window.base;169const BYTE* const prefixStart = base + ms->window.dictLimit;170U32 const curr = 
(U32)(ip-base);171const BYTE* const dictBase = dms->window.base;172const BYTE* const dictEnd = dms->window.nextSrc;173U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);174U32 const dictLowLimit = dms->window.lowLimit;175U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;176177U32* const dictBt = dms->chainTable;178U32 const btLog = dmsCParams->chainLog - 1;179U32 const btMask = (1 << btLog) - 1;180U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;181182size_t commonLengthSmaller=0, commonLengthLarger=0;183184(void)dictMode;185assert(dictMode == ZSTD_dictMatchState);186187for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {188U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);189size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */190const BYTE* match = dictBase + dictMatchIndex;191matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);192if (dictMatchIndex+matchLength >= dictHighLimit)193match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */194195if (matchLength > bestLength) {196U32 matchIndex = dictMatchIndex + dictIndexDelta;197if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {198DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",199curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);200bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);201}202if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */203break; /* drop, to guarantee consistency (miss a little bit of compression) */204}205}206207if (match[matchLength] < ip[matchLength]) {208if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */209commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */210dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */211} else {212/* match is larger than current */213if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */214commonLengthLarger = matchLength;215dictMatchIndex = nextPtr[0];216}217}218219if (bestLength >= MINMATCH) {220U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;221DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",222curr, (U32)bestLength, (U32)*offsetPtr, mIndex);223}224return bestLength;225226}227228229static size_t230ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,231const BYTE* const ip, const BYTE* const iend,232size_t* offsetPtr,233U32 const mls,234const ZSTD_dictMode_e dictMode)235{236const ZSTD_compressionParameters* const cParams = &ms->cParams;237U32* const hashTable = ms->hashTable;238U32 const hashLog = cParams->hashLog;239size_t const h = ZSTD_hashPtr(ip, hashLog, mls);240U32 matchIndex = hashTable[h];241242const BYTE* const base = ms->window.base;243U32 const curr = (U32)(ip-base);244U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);245246U32* const bt = ms->chainTable;247U32 const btLog = cParams->chainLog - 1;248U32 const btMask = (1 << btLog) - 1;249U32 
const btLow = (btMask >= curr) ? 0 : curr - btMask;250U32 const unsortLimit = MAX(btLow, windowLow);251252U32* nextCandidate = bt + 2*(matchIndex&btMask);253U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;254U32 nbCompares = 1U << cParams->searchLog;255U32 nbCandidates = nbCompares;256U32 previousCandidate = 0;257258DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);259assert(ip <= iend-8); /* required for h calculation */260assert(dictMode != ZSTD_dedicatedDictSearch);261262/* reach end of unsorted candidates list */263while ( (matchIndex > unsortLimit)264&& (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)265&& (nbCandidates > 1) ) {266DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",267matchIndex);268*unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */269previousCandidate = matchIndex;270matchIndex = *nextCandidate;271nextCandidate = bt + 2*(matchIndex&btMask);272unsortedMark = bt + 2*(matchIndex&btMask) + 1;273nbCandidates --;274}275276/* nullify last candidate if it's still unsorted277* simplification, detrimental to compression ratio, beneficial for speed */278if ( (matchIndex > unsortLimit)279&& (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {280DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",281matchIndex);282*nextCandidate = *unsortedMark = 0;283}284285/* batch sort stacked candidates */286matchIndex = previousCandidate;287while (matchIndex) { /* will end on matchIndex == 0 */288U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;289U32 const nextCandidateIdx = *nextCandidateIdxPtr;290ZSTD_insertDUBT1(ms, matchIndex, iend,291nbCandidates, unsortLimit, dictMode);292matchIndex = nextCandidateIdx;293nbCandidates++;294}295296/* find longest match */297{ size_t commonLengthSmaller = 0, commonLengthLarger = 0;298const BYTE* const dictBase = ms->window.dictBase;299const U32 dictLimit = ms->window.dictLimit;300const BYTE* const dictEnd = dictBase + dictLimit;301const BYTE* const prefixStart = base + dictLimit;302U32* smallerPtr = bt + 2*(curr&btMask);303U32* largerPtr = bt + 2*(curr&btMask) + 1;304U32 matchEndIdx = curr + 8 + 1;305U32 dummy32; /* to be nullified at the end */306size_t bestLength = 0;307308matchIndex = hashTable[h];309hashTable[h] = curr; /* Update Hash Table */310311for (; nbCompares && (matchIndex > windowLow); --nbCompares) {312U32* const nextPtr = bt + 2*(matchIndex & btMask);313size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */314const BYTE* match;315316if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {317match = base + matchIndex;318matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);319} else {320match = dictBase + matchIndex;321matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);322if (matchIndex+matchLength >= dictLimit)323match = base + matchIndex; /* to prepare for next usage of match[matchLength] */324}325326if (matchLength > bestLength) {327if (matchLength > matchEndIdx - matchIndex)328matchEndIdx = matchIndex + (U32)matchLength;329if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )330bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);331if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */332if (dictMode == ZSTD_dictMatchState) {333nbCompares = 0; /* in addition to avoiding checking any334* further 
in this loop, make sure we335* skip checking in the dictionary. */336}337break; /* drop, to guarantee consistency (miss a little bit of compression) */338}339}340341if (match[matchLength] < ip[matchLength]) {342/* match is smaller than current */343*smallerPtr = matchIndex; /* update smaller idx */344commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */345if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */346smallerPtr = nextPtr+1; /* new "smaller" => larger of match */347matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */348} else {349/* match is larger than current */350*largerPtr = matchIndex;351commonLengthLarger = matchLength;352if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */353largerPtr = nextPtr;354matchIndex = nextPtr[0];355} }356357*smallerPtr = *largerPtr = 0;358359assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */360if (dictMode == ZSTD_dictMatchState && nbCompares) {361bestLength = ZSTD_DUBT_findBetterDictMatch(362ms, ip, iend,363offsetPtr, bestLength, nbCompares,364mls, dictMode);365}366367assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */368ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */369if (bestLength >= MINMATCH) {370U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;371DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",372curr, (U32)bestLength, (U32)*offsetPtr, mIndex);373}374return bestLength;375}376}377378379/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */380FORCE_INLINE_TEMPLATE size_t381ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,382const BYTE* const ip, const BYTE* const iLimit,383size_t* offsetPtr,384const U32 mls /* template */,385const ZSTD_dictMode_e dictMode)386{387DEBUGLOG(7, "ZSTD_BtFindBestMatch");388if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */389ZSTD_updateDUBT(ms, ip, iLimit, mls);390return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);391}392393/***********************************394* Dedicated dict search395***********************************/396397void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)398{399const BYTE* const base = ms->window.base;400U32 const target = (U32)(ip - base);401U32* const hashTable = ms->hashTable;402U32* const chainTable = ms->chainTable;403U32 const chainSize = 1 << ms->cParams.chainLog;404U32 idx = ms->nextToUpdate;405U32 const minChain = chainSize < target - idx ? target - chainSize : idx;406U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;407U32 const cacheSize = bucketSize - 1;408U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;409U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;410411/* We know the hashtable is oversized by a factor of `bucketSize`.412* We are going to temporarily pretend `bucketSize == 1`, keeping only a413* single entry. We will use the rest of the space to construct a temporary414* chaintable.415*/416U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;417U32* const tmpHashTable = hashTable;418U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);419U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;420U32 const tmpMinChain = tmpChainSize < target ? 
target - tmpChainSize : idx;421U32 hashIdx;422423assert(ms->cParams.chainLog <= 24);424assert(ms->cParams.hashLog > ms->cParams.chainLog);425assert(idx != 0);426assert(tmpMinChain <= minChain);427428/* fill conventional hash table and conventional chain table */429for ( ; idx < target; idx++) {430U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);431if (idx >= tmpMinChain) {432tmpChainTable[idx - tmpMinChain] = hashTable[h];433}434tmpHashTable[h] = idx;435}436437/* sort chains into ddss chain table */438{439U32 chainPos = 0;440for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {441U32 count;442U32 countBeyondMinChain = 0;443U32 i = tmpHashTable[hashIdx];444for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {445/* skip through the chain to the first position that won't be446* in the hash cache bucket */447if (i < minChain) {448countBeyondMinChain++;449}450i = tmpChainTable[i - tmpMinChain];451}452if (count == cacheSize) {453for (count = 0; count < chainLimit;) {454if (i < minChain) {455if (!i || ++countBeyondMinChain > cacheSize) {456/* only allow pulling `cacheSize` number of entries457* into the cache or chainTable beyond `minChain`,458* to replace the entries pulled out of the459* chainTable into the cache. This lets us reach460* back further without increasing the total number461* of entries in the chainTable, guaranteeing the462* DDSS chain table will fit into the space463* allocated for the regular one. */464break;465}466}467chainTable[chainPos++] = i;468count++;469if (i < tmpMinChain) {470break;471}472i = tmpChainTable[i - tmpMinChain];473}474} else {475count = 0;476}477if (count) {478tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;479} else {480tmpHashTable[hashIdx] = 0;481}482}483assert(chainPos <= chainSize); /* I believe this is guaranteed... */484}485486/* move chain pointers into the last entry of each hash bucket */487for (hashIdx = (1 << hashLog); hashIdx; ) {488U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;489U32 const chainPackedPointer = tmpHashTable[hashIdx];490U32 i;491for (i = 0; i < cacheSize; i++) {492hashTable[bucketIdx + i] = 0;493}494hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;495}496497/* fill the buckets of the hash table */498for (idx = ms->nextToUpdate; idx < target; idx++) {499U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)500<< ZSTD_LAZY_DDSS_BUCKET_LOG;501U32 i;502/* Shift hash cache down 1. */503for (i = cacheSize - 1; i; i--)504hashTable[h + i] = hashTable[h + i - 1];505hashTable[h] = idx;506}507508ms->nextToUpdate = target;509}510511/* Returns the longest match length found in the dedicated dict search structure.512* If none are longer than the argument ml, then ml will be returned.513*/514FORCE_INLINE_TEMPLATE515size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,516const ZSTD_matchState_t* const dms,517const BYTE* const ip, const BYTE* const iLimit,518const BYTE* const prefixStart, const U32 curr,519const U32 dictLimit, const size_t ddsIdx) {520const U32 ddsLowestIndex = dms->window.dictLimit;521const BYTE* const ddsBase = dms->window.base;522const BYTE* const ddsEnd = dms->window.nextSrc;523const U32 ddsSize = (U32)(ddsEnd - ddsBase);524const U32 ddsIndexDelta = dictLimit - ddsSize;525const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);526const U32 bucketLimit = nbAttempts < bucketSize - 1 ? 
nbAttempts : bucketSize - 1;527U32 ddsAttempt;528U32 matchIndex;529530for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {531PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);532}533534{535U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];536U32 const chainIndex = chainPackedPointer >> 8;537538PREFETCH_L1(&dms->chainTable[chainIndex]);539}540541for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {542size_t currentMl=0;543const BYTE* match;544matchIndex = dms->hashTable[ddsIdx + ddsAttempt];545match = ddsBase + matchIndex;546547if (!matchIndex) {548return ml;549}550551/* guaranteed by table construction */552(void)ddsLowestIndex;553assert(matchIndex >= ddsLowestIndex);554assert(match+4 <= ddsEnd);555if (MEM_read32(match) == MEM_read32(ip)) {556/* assumption : matchIndex <= dictLimit-4 (by table construction) */557currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;558}559560/* save best solution */561if (currentMl > ml) {562ml = currentMl;563*offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));564if (ip+currentMl == iLimit) {565/* best possible, avoids read overflow on next attempt */566return ml;567}568}569}570571{572U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];573U32 chainIndex = chainPackedPointer >> 8;574U32 const chainLength = chainPackedPointer & 0xFF;575U32 const chainAttempts = nbAttempts - ddsAttempt;576U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;577U32 chainAttempt;578579for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {580PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);581}582583for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {584size_t currentMl=0;585const BYTE* match;586matchIndex = dms->chainTable[chainIndex];587match = ddsBase + matchIndex;588589/* guaranteed by table construction */590assert(matchIndex >= ddsLowestIndex);591assert(match+4 <= ddsEnd);592if (MEM_read32(match) == MEM_read32(ip)) {593/* assumption : matchIndex <= dictLimit-4 (by table construction) */594currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;595}596597/* save best solution */598if (currentMl > ml) {599ml = currentMl;600*offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));601if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */602}603}604}605return ml;606}607608609/* *********************************610* Hash Chain611***********************************/612#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]613614/* Update chains up to ip (excluded)615Assumption : always within prefix (i.e. 
not within extDict) */616FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(617ZSTD_matchState_t* ms,618const ZSTD_compressionParameters* const cParams,619const BYTE* ip, U32 const mls)620{621U32* const hashTable = ms->hashTable;622const U32 hashLog = cParams->hashLog;623U32* const chainTable = ms->chainTable;624const U32 chainMask = (1 << cParams->chainLog) - 1;625const BYTE* const base = ms->window.base;626const U32 target = (U32)(ip - base);627U32 idx = ms->nextToUpdate;628629while(idx < target) { /* catch up */630size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);631NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];632hashTable[h] = idx;633idx++;634}635636ms->nextToUpdate = target;637return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];638}639640U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {641const ZSTD_compressionParameters* const cParams = &ms->cParams;642return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);643}644645/* inlining is important to hardwire a hot branch (template emulation) */646FORCE_INLINE_TEMPLATE647size_t ZSTD_HcFindBestMatch(648ZSTD_matchState_t* ms,649const BYTE* const ip, const BYTE* const iLimit,650size_t* offsetPtr,651const U32 mls, const ZSTD_dictMode_e dictMode)652{653const ZSTD_compressionParameters* const cParams = &ms->cParams;654U32* const chainTable = ms->chainTable;655const U32 chainSize = (1 << cParams->chainLog);656const U32 chainMask = chainSize-1;657const BYTE* const base = ms->window.base;658const BYTE* const dictBase = ms->window.dictBase;659const U32 dictLimit = ms->window.dictLimit;660const BYTE* const prefixStart = base + dictLimit;661const BYTE* const dictEnd = dictBase + dictLimit;662const U32 curr = (U32)(ip-base);663const U32 maxDistance = 1U << cParams->windowLog;664const U32 lowestValid = ms->window.lowLimit;665const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;666const U32 isDictionary = (ms->loadedDictEnd != 0);667const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;668const U32 minChain = curr > chainSize ? curr - chainSize : 0;669U32 nbAttempts = 1U << cParams->searchLog;670size_t ml=4-1;671672const ZSTD_matchState_t* const dms = ms->dictMatchState;673const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch674? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;675const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch676? 
ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;677678U32 matchIndex;679680if (dictMode == ZSTD_dedicatedDictSearch) {681const U32* entry = &dms->hashTable[ddsIdx];682PREFETCH_L1(entry);683}684685/* HC4 match finder */686matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);687688for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {689size_t currentMl=0;690if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {691const BYTE* const match = base + matchIndex;692assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */693if (match[ml] == ip[ml]) /* potentially better */694currentMl = ZSTD_count(ip, match, iLimit);695} else {696const BYTE* const match = dictBase + matchIndex;697assert(match+4 <= dictEnd);698if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */699currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;700}701702/* save best solution */703if (currentMl > ml) {704ml = currentMl;705*offsetPtr = STORE_OFFSET(curr - matchIndex);706if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */707}708709if (matchIndex <= minChain) break;710matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);711}712713assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */714if (dictMode == ZSTD_dedicatedDictSearch) {715ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,716ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);717} else if (dictMode == ZSTD_dictMatchState) {718const U32* const dmsChainTable = dms->chainTable;719const U32 dmsChainSize = (1 << dms->cParams.chainLog);720const U32 dmsChainMask = dmsChainSize - 1;721const U32 dmsLowestIndex = dms->window.dictLimit;722const BYTE* const dmsBase = dms->window.base;723const BYTE* const dmsEnd = dms->window.nextSrc;724const U32 dmsSize = (U32)(dmsEnd - dmsBase);725const U32 dmsIndexDelta = dictLimit - dmsSize;726const U32 dmsMinChain = dmsSize > dmsChainSize ? 
dmsSize - dmsChainSize : 0;727728matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];729730for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {731size_t currentMl=0;732const BYTE* const match = dmsBase + matchIndex;733assert(match+4 <= dmsEnd);734if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */735currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;736737/* save best solution */738if (currentMl > ml) {739ml = currentMl;740assert(curr > matchIndex + dmsIndexDelta);741*offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));742if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */743}744745if (matchIndex <= dmsMinChain) break;746747matchIndex = dmsChainTable[matchIndex & dmsChainMask];748}749}750751return ml;752}753754/* *********************************755* (SIMD) Row-based matchfinder756***********************************/757/* Constants for row-based hash */758#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */759#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */760#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)761#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */762763#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)764765typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */766767/* ZSTD_VecMask_next():768* Starting from the LSB, returns the idx of the next non-zero bit.769* Basically counting the nb of trailing zeroes.770*/771static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {772assert(val != 0);773# if defined(_MSC_VER) && defined(_WIN64)774if (val != 0) {775unsigned long r;776_BitScanForward64(&r, val);777return (U32)(r);778} else {779/* Should not reach this code path */780__assume(0);781}782# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))783if (sizeof(size_t) == 4) {784U32 mostSignificantWord = (U32)(val >> 32);785U32 leastSignificantWord = (U32)val;786if (leastSignificantWord == 0) {787return 32 + (U32)__builtin_ctz(mostSignificantWord);788} else {789return (U32)__builtin_ctz(leastSignificantWord);790}791} else {792return (U32)__builtin_ctzll(val);793}794# else795/* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count796* and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer797*/798val = ~val & (val - 1ULL); /* Lowest set bit mask */799val = val - ((val >> 1) & 0x5555555555555555);800val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);801return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);802# endif803}804805/* ZSTD_rotateRight_*():806* Rotates a bitfield to the right by "count" bits.807* https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts808*/809FORCE_INLINE_TEMPLATE810U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {811assert(count < 64);812count &= 0x3F; /* for fickle pattern recognition */813return (value >> count) | (U64)(value << ((0U - count) & 0x3F));814}815816FORCE_INLINE_TEMPLATE817U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {818assert(count < 32);819count &= 0x1F; /* for fickle pattern recognition */820return (value >> count) | (U32)(value << 
((0U - count) & 0x1F));821}822823FORCE_INLINE_TEMPLATE824U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {825assert(count < 16);826count &= 0x0F; /* for fickle pattern recognition */827return (value >> count) | (U16)(value << ((0U - count) & 0x0F));828}829830/* ZSTD_row_nextIndex():831* Returns the next index to insert at within a tagTable row, and updates the "head"832* value to reflect the update. Essentially cycles backwards from [0, {entries per row})833*/834FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {835U32 const next = (*tagRow - 1) & rowMask;836*tagRow = (BYTE)next;837return next;838}839840/* ZSTD_isAligned():841* Checks that a pointer is aligned to "align" bytes which must be a power of 2.842*/843MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {844assert((align & (align - 1)) == 0);845return (((size_t)ptr) & (align - 1)) == 0;846}847848/* ZSTD_row_prefetch():849* Performs prefetching for the hashTable and tagTable at a given row.850*/851FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {852PREFETCH_L1(hashTable + relRow);853if (rowLog >= 5) {854PREFETCH_L1(hashTable + relRow + 16);855/* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */856}857PREFETCH_L1(tagTable + relRow);858if (rowLog == 6) {859PREFETCH_L1(tagTable + relRow + 32);860}861assert(rowLog == 4 || rowLog == 5 || rowLog == 6);862assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */863assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */864}865866/* ZSTD_row_fillHashCache():867* Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,868* but not beyond iLimit.869*/870FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,871U32 const rowLog, U32 const mls,872U32 idx, const BYTE* const iLimit)873{874U32 const* const hashTable = ms->hashTable;875U16 const* const tagTable = ms->tagTable;876U32 const hashLog = ms->rowHashLog;877U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);878U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);879880for (; idx < lim; ++idx) {881U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);882U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;883ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);884ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;885}886887DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],888ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],889ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);890}891892/* ZSTD_row_nextCachedHash():893* Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at894* base + idx + ZSTD_ROW_HASH_CACHE_SIZE. 
Also prefetches the appropriate rows from hashTable and tagTable.895*/896FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,897U16 const* tagTable, BYTE const* base,898U32 idx, U32 const hashLog,899U32 const rowLog, U32 const mls)900{901U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);902U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;903ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);904{ U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];905cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;906return hash;907}908}909910/* ZSTD_row_update_internalImpl():911* Updates the hash table with positions starting from updateStartIdx until updateEndIdx.912*/913FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,914U32 updateStartIdx, U32 const updateEndIdx,915U32 const mls, U32 const rowLog,916U32 const rowMask, U32 const useCache)917{918U32* const hashTable = ms->hashTable;919U16* const tagTable = ms->tagTable;920U32 const hashLog = ms->rowHashLog;921const BYTE* const base = ms->window.base;922923DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);924for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {925U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)926: (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);927U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;928U32* const row = hashTable + relRow;929BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.930Explicit cast allows us to get exact desired position within each row */931U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);932933assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));934((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;935row[pos] = updateStartIdx;936}937}938939/* ZSTD_row_update_internal():940* Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.941* Skips sections of long matches as is necessary.942*/943FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,944U32 const mls, U32 const rowLog,945U32 const rowMask, U32 const useCache)946{947U32 idx = ms->nextToUpdate;948const BYTE* const base = ms->window.base;949const U32 target = (U32)(ip - base);950const U32 kSkipThreshold = 384;951const U32 kMaxMatchStartPositionsToUpdate = 96;952const U32 kMaxMatchEndPositionsToUpdate = 32;953954if (useCache) {955/* Only skip positions when using hash cache, i.e.956* if we are loading a dict, don't skip anything.957* If we decide to skip, then we only update a set number958* of positions at the beginning and end of the match.959*/960if (UNLIKELY(target - idx > kSkipThreshold)) {961U32 const bound = idx + kMaxMatchStartPositionsToUpdate;962ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);963idx = target - kMaxMatchEndPositionsToUpdate;964ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);965}966}967assert(target >= idx);968ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);969ms->nextToUpdate = target;970}971972/* ZSTD_row_update():973* External wrapper for ZSTD_row_update_internal(). 
Used for filling the hashtable during dictionary974* processing.975*/976void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {977const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);978const U32 rowMask = (1u << rowLog) - 1;979const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);980981DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);982ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);983}984985#if defined(ZSTD_ARCH_X86_SSE2)986FORCE_INLINE_TEMPLATE ZSTD_VecMask987ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)988{989const __m128i comparisonMask = _mm_set1_epi8((char)tag);990int matches[4] = {0};991int i;992assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);993for (i=0; i<nbChunks; i++) {994const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));995const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);996matches[i] = _mm_movemask_epi8(equalMask);997}998if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);999if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);1000assert(nbChunks == 4);1001return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);1002}1003#endif10041005/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches1006* the hash at the nth position in a row of the tagTable.1007* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield1008* to match up with the actual layout of the entries within the hashTable */1009FORCE_INLINE_TEMPLATE ZSTD_VecMask1010ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)1011{1012const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;1013assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);1014assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);10151016#if defined(ZSTD_ARCH_X86_SSE2)10171018return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);10191020#else /* SW or NEON-LE */10211022# if defined(ZSTD_ARCH_ARM_NEON)1023/* This NEON path only works for little endian - otherwise use SWAR below */1024if (MEM_isLittleEndian()) {1025if (rowEntries == 16) {1026const uint8x16_t chunk = vld1q_u8(src);1027const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));1028const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);1029const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));1030const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));1031const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));1032const U16 hi = (U16)vgetq_lane_u8(t3, 8);1033const U16 lo = (U16)vgetq_lane_u8(t3, 0);1034return ZSTD_rotateRight_U16((hi << 8) | lo, head);1035} else if (rowEntries == 32) {1036const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);1037const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);1038const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);1039const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));1040const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));1041const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));1042const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));1043const uint8x8_t t0 = vreinterpret_u8_s8(pack0);1044const uint8x8_t t1 = vreinterpret_u8_s8(pack1);1045const uint8x8_t t2 = vsri_n_u8(t1, t0, 
2);1046const uint8x8x2_t t3 = vuzp_u8(t2, t0);1047const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);1048const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);1049return ZSTD_rotateRight_U32(matches, head);1050} else { /* rowEntries == 64 */1051const uint8x16x4_t chunk = vld4q_u8(src);1052const uint8x16_t dup = vdupq_n_u8(tag);1053const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);1054const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);1055const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);1056const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);10571058const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);1059const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);1060const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);1061const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);1062const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);1063const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);1064return ZSTD_rotateRight_U64(matches, head);1065}1066}1067# endif /* ZSTD_ARCH_ARM_NEON */1068/* SWAR */1069{ const size_t chunkSize = sizeof(size_t);1070const size_t shiftAmount = ((chunkSize * 8) - chunkSize);1071const size_t xFF = ~((size_t)0);1072const size_t x01 = xFF / 0xFF;1073const size_t x80 = x01 << 7;1074const size_t splatChar = tag * x01;1075ZSTD_VecMask matches = 0;1076int i = rowEntries - chunkSize;1077assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));1078if (MEM_isLittleEndian()) { /* runtime check so have two loops */1079const size_t extractMagic = (xFF / 0x7F) >> chunkSize;1080do {1081size_t chunk = MEM_readST(&src[i]);1082chunk ^= splatChar;1083chunk = (((chunk | x80) - x01) | chunk) & x80;1084matches <<= chunkSize;1085matches |= (chunk * extractMagic) >> shiftAmount;1086i -= chunkSize;1087} while (i >= 0);1088} else { /* big endian: reverse bits during extraction */1089const size_t msb = xFF ^ (xFF >> 1);1090const size_t extractMagic = (msb / 0x1FF) | msb;1091do {1092size_t chunk = MEM_readST(&src[i]);1093chunk ^= splatChar;1094chunk = (((chunk | x80) - x01) | chunk) & x80;1095matches <<= chunkSize;1096matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;1097i -= chunkSize;1098} while (i >= 0);1099}1100matches = ~matches;1101if (rowEntries == 16) {1102return ZSTD_rotateRight_U16((U16)matches, head);1103} else if (rowEntries == 32) {1104return ZSTD_rotateRight_U32((U32)matches, head);1105} else {1106return ZSTD_rotateRight_U64((U64)matches, head);1107}1108}1109#endif1110}11111112/* The high-level approach of the SIMD row based match finder is as follows:1113* - Figure out where to insert the new entry:1114* - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"1115* - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines1116* which row to insert into.1117* - Determine the correct position within the row to insert the entry into. 
Each row of 16 or 32 can1118* be considered as a circular buffer with a "head" index that resides in the tagTable.1119* - Also insert the "tag" into the equivalent row and position in the tagTable.1120* - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.1121* The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,1122* for alignment/performance reasons, leaving some bytes unused.1123* - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and1124* generate a bitfield that we can cycle through to check the collisions in the hash table.1125* - Pick the longest match.1126*/1127FORCE_INLINE_TEMPLATE1128size_t ZSTD_RowFindBestMatch(1129ZSTD_matchState_t* ms,1130const BYTE* const ip, const BYTE* const iLimit,1131size_t* offsetPtr,1132const U32 mls, const ZSTD_dictMode_e dictMode,1133const U32 rowLog)1134{1135U32* const hashTable = ms->hashTable;1136U16* const tagTable = ms->tagTable;1137U32* const hashCache = ms->hashCache;1138const U32 hashLog = ms->rowHashLog;1139const ZSTD_compressionParameters* const cParams = &ms->cParams;1140const BYTE* const base = ms->window.base;1141const BYTE* const dictBase = ms->window.dictBase;1142const U32 dictLimit = ms->window.dictLimit;1143const BYTE* const prefixStart = base + dictLimit;1144const BYTE* const dictEnd = dictBase + dictLimit;1145const U32 curr = (U32)(ip-base);1146const U32 maxDistance = 1U << cParams->windowLog;1147const U32 lowestValid = ms->window.lowLimit;1148const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;1149const U32 isDictionary = (ms->loadedDictEnd != 0);1150const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;1151const U32 rowEntries = (1U << rowLog);1152const U32 rowMask = rowEntries - 1;1153const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */1154U32 nbAttempts = 1U << cappedSearchLog;1155size_t ml=4-1;11561157/* DMS/DDS variables that may be referenced laster */1158const ZSTD_matchState_t* const dms = ms->dictMatchState;11591160/* Initialize the following variables to satisfy static analyzer */1161size_t ddsIdx = 0;1162U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */1163U32 dmsTag = 0;1164U32* dmsRow = NULL;1165BYTE* dmsTagRow = NULL;11661167if (dictMode == ZSTD_dedicatedDictSearch) {1168const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;1169{ /* Prefetch DDS hashtable entry */1170ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;1171PREFETCH_L1(&dms->hashTable[ddsIdx]);1172}1173ddsExtraAttempts = cParams->searchLog > rowLog ? 
1U << (cParams->searchLog - rowLog) : 0;1174}11751176if (dictMode == ZSTD_dictMatchState) {1177/* Prefetch DMS rows */1178U32* const dmsHashTable = dms->hashTable;1179U16* const dmsTagTable = dms->tagTable;1180U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);1181U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;1182dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;1183dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);1184dmsRow = dmsHashTable + dmsRelRow;1185ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);1186}11871188/* Update the hashTable and tagTable up to (but not including) ip */1189ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);1190{ /* Get the hash for ip, compute the appropriate row */1191U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);1192U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;1193U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;1194U32* const row = hashTable + relRow;1195BYTE* tagRow = (BYTE*)(tagTable + relRow);1196U32 const head = *tagRow & rowMask;1197U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];1198size_t numMatches = 0;1199size_t currMatch = 0;1200ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);12011202/* Cycle through the matches and prefetch */1203for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {1204U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;1205U32 const matchIndex = row[matchPos];1206assert(numMatches < rowEntries);1207if (matchIndex < lowLimit)1208break;1209if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {1210PREFETCH_L1(base + matchIndex);1211} else {1212PREFETCH_L1(dictBase + matchIndex);1213}1214matchBuffer[numMatches++] = matchIndex;1215}12161217/* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop1218in ZSTD_row_update_internal() at the next search. */1219{1220U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);1221tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;1222row[pos] = ms->nextToUpdate++;1223}12241225/* Return the longest match */1226for (; currMatch < numMatches; ++currMatch) {1227U32 const matchIndex = matchBuffer[currMatch];1228size_t currentMl=0;1229assert(matchIndex < curr);1230assert(matchIndex >= lowLimit);12311232if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {1233const BYTE* const match = base + matchIndex;1234assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */1235if (match[ml] == ip[ml]) /* potentially better */1236currentMl = ZSTD_count(ip, match, iLimit);1237} else {1238const BYTE* const match = dictBase + matchIndex;1239assert(match+4 <= dictEnd);1240if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */1241currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;1242}12431244/* Save best solution */1245if (currentMl > ml) {1246ml = currentMl;1247*offsetPtr = STORE_OFFSET(curr - matchIndex);1248if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */1249}1250}1251}12521253assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/1254if (dictMode == ZSTD_dedicatedDictSearch) {1255ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,1256ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);1257} else if (dictMode == ZSTD_dictMatchState) {1258/* TODO: Measure and potentially add prefetching to DMS */1259const U32 dmsLowestIndex = dms->window.dictLimit;1260const BYTE* const dmsBase = dms->window.base;1261const BYTE* const dmsEnd = dms->window.nextSrc;1262const U32 dmsSize = (U32)(dmsEnd - dmsBase);1263const U32 dmsIndexDelta = dictLimit - dmsSize;12641265{ U32 const head = *dmsTagRow & rowMask;1266U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];1267size_t numMatches = 0;1268size_t currMatch = 0;1269ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);12701271for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {1272U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;1273U32 const matchIndex = dmsRow[matchPos];1274if (matchIndex < dmsLowestIndex)1275break;1276PREFETCH_L1(dmsBase + matchIndex);1277matchBuffer[numMatches++] = matchIndex;1278}12791280/* Return the longest match */1281for (; currMatch < numMatches; ++currMatch) {1282U32 const matchIndex = matchBuffer[currMatch];1283size_t currentMl=0;1284assert(matchIndex >= dmsLowestIndex);1285assert(matchIndex < curr);12861287{ const BYTE* const match = dmsBase + matchIndex;1288assert(match+4 <= dmsEnd);1289if (MEM_read32(match) == MEM_read32(ip))1290currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;1291}12921293if (currentMl > ml) {1294ml = currentMl;1295assert(curr > matchIndex + dmsIndexDelta);1296*offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));1297if (ip+currentMl == iLimit) break;1298}1299}1300}1301}1302return ml;1303}130413051306typedef size_t (*searchMax_f)(1307ZSTD_matchState_t* ms,1308const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);13091310/**1311* This struct contains the functions necessary for lazy to search.1312* Currently, that is only searchMax. However, it is still valuable to have the1313* VTable because this makes it easier to add more functions to the VTable later.1314*1315* TODO: The start of the search function involves loading and calculating a1316* bunch of constants from the ZSTD_matchState_t. These computations could be1317* done in an initialization function, and saved somewhere in the match state.1318* Then we could pass a pointer to the saved state instead of the match state,1319* and avoid duplicate computations.1320*1321* TODO: Move the match re-winding into searchMax. This improves compression1322* ratio, and unlocks further simplifications with the next TODO.1323*1324* TODO: Try moving the repcode search into searchMax. After the re-winding1325* and repcode search are in searchMax, there is no more logic in the match1326* finder loop that requires knowledge about the dictMode. So we should be1327* able to avoid force inlining it, and we can join the extDict loop with1328* the single segment loop. 
It should go in searchMax instead of its own1329* function to avoid having multiple virtual function calls per search.1330*/1331typedef struct {1332searchMax_f searchMax;1333} ZSTD_LazyVTable;13341335#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \1336static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \1337ZSTD_matchState_t* ms, \1338const BYTE* ip, const BYTE* const iLimit, \1339size_t* offsetPtr) \1340{ \1341assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \1342return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \1343} \1344static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \1345ZSTD_BtFindBestMatch_##dictMode##_##mls \1346};13471348#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \1349static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \1350ZSTD_matchState_t* ms, \1351const BYTE* ip, const BYTE* const iLimit, \1352size_t* offsetPtr) \1353{ \1354assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \1355return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \1356} \1357static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \1358ZSTD_HcFindBestMatch_##dictMode##_##mls \1359};13601361#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \1362static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \1363ZSTD_matchState_t* ms, \1364const BYTE* ip, const BYTE* const iLimit, \1365size_t* offsetPtr) \1366{ \1367assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \1368assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \1369return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \1370} \1371static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \1372ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \1373};13741375#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \1376X(dictMode, mls, 4) \1377X(dictMode, mls, 5) \1378X(dictMode, mls, 6)13791380#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \1381ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \1382ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \1383ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)13841385#define ZSTD_FOR_EACH_MLS(X, dictMode) \1386X(dictMode, 4) \1387X(dictMode, 5) \1388X(dictMode, 6)13891390#define ZSTD_FOR_EACH_DICT_MODE(X, ...) 
\1391X(__VA_ARGS__, noDict) \1392X(__VA_ARGS__, extDict) \1393X(__VA_ARGS__, dictMatchState) \1394X(__VA_ARGS__, dedicatedDictSearch)13951396/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */1397ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)1398/* Generate Binary Tree VTables for each combination of (dictMode, mls) */1399ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)1400/* Generate Hash Chain VTables for each combination of (dictMode, mls) */1401ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)14021403#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \1404{ \1405&ZSTD_BtVTable_##dictMode##_4, \1406&ZSTD_BtVTable_##dictMode##_5, \1407&ZSTD_BtVTable_##dictMode##_6 \1408}14091410#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \1411{ \1412&ZSTD_HcVTable_##dictMode##_4, \1413&ZSTD_HcVTable_##dictMode##_5, \1414&ZSTD_HcVTable_##dictMode##_6 \1415}14161417#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \1418{ \1419&ZSTD_RowVTable_##dictMode##_##mls##_4, \1420&ZSTD_RowVTable_##dictMode##_##mls##_5, \1421&ZSTD_RowVTable_##dictMode##_##mls##_6 \1422}14231424#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \1425{ \1426GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \1427GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \1428GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \1429}14301431#define GEN_ZSTD_VTABLE_ARRAY(X) \1432{ \1433X(noDict), \1434X(extDict), \1435X(dictMatchState), \1436X(dedicatedDictSearch) \1437}14381439/* *******************************1440* Common parser - lazy strategy1441*********************************/1442typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;14431444/**1445* This table is indexed first by the four ZSTD_dictMode_e values, and then1446* by the two searchMethod_e values. NULLs are placed for configurations1447* that should never occur (extDict modes go to the other implementation1448* below and there is no DDSS for binary tree search yet).1449*/14501451static ZSTD_LazyVTable const*1452ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)1453{1454/* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */1455ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);1456ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);1457/* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */1458ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);14591460U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));1461U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));1462switch (searchMethod) {1463case search_hashChain:1464return hcVTables[dictMode][mls - 4];1465case search_binaryTree:1466return btVTables[dictMode][mls - 4];1467case search_rowHash:1468return rowVTables[dictMode][mls - 4][rowLog - 4];1469default:1470return NULL;1471}1472}14731474FORCE_INLINE_TEMPLATE size_t1475ZSTD_compressBlock_lazy_generic(1476ZSTD_matchState_t* ms, seqStore_t* seqStore,1477U32 rep[ZSTD_REP_NUM],1478const void* src, size_t srcSize,1479const searchMethod_e searchMethod, const U32 depth,1480ZSTD_dictMode_e const dictMode)1481{1482const BYTE* const istart = (const BYTE*)src;1483const BYTE* ip = istart;1484const BYTE* anchor = istart;1485const BYTE* const iend = istart + srcSize;1486const BYTE* const ilimit = (searchMethod == search_rowHash) ? 
FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth,
                        ZSTD_dictMode_e const dictMode)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 prefixLowestIndex = ms->window.dictLimit;
    const BYTE* const prefixLowest = base + prefixLowestIndex;

    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

    const int isDMS = dictMode == ZSTD_dictMatchState;
    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
    const int isDxS = isDMS || isDDS;
    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 dictLowestIndex    = isDxS ? dms->window.dictLimit : 0;
    const BYTE* const dictBase   = isDxS ? dms->window.base : NULL;
    const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
    const BYTE* const dictEnd    = isDxS ? dms->window.nextSrc : NULL;
    const U32 dictIndexDelta     = isDxS ?
                                   prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                   0;
    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

    assert(searchMax != NULL);

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
    ip += (dictAndPrefixLength == 0);
    if (dictMode == ZSTD_noDict) {
        U32 const curr = (U32)(ip - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
        U32 const maxRep = curr - windowLow;
        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
    }
    if (isDxS) {
        /* dictMatchState repCode checks don't currently handle repCode == 0
         * disabling. */
        assert(offset_1 <= dictAndPrefixLength);
        assert(offset_2 <= dictAndPrefixLength);
    }

    if (searchMethod == search_rowHash) {
        const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
        ZSTD_row_fillHashCache(ms, base, rowLog,
                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                               ms->nextToUpdate, ilimit);
    }
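    /*
     * Each iteration of the match loop below tracks the best candidate found
     * so far in (matchLength, offcode, start): offcode is either a stored
     * repcode (STORE_REPCODE_1) or a stored offset, and start is the position
     * where the candidate match begins. Repcode candidates are probed first at
     * ip+1 (one literal ahead), then searchMax() looks for a regular match at
     * ip, and, for depth >= 1, the parser keeps advancing ip to see whether a
     * strictly better match starts at the following position(s).
     */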
    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offcode=STORE_REPCODE_1;
        const BYTE* start=ip+1;
        DEBUGLOG(7, "search baseline (depth 0)");

        /* check repCode */
        if (isDxS) {
            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                && repIndex < prefixLowestIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                if (depth==0) goto _storeSequence;
            }
        }
        if ( dictMode == ZSTD_noDict
          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            if (depth==0) goto _storeSequence;
        }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offcode=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }
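        /*
         * Lazy arbitration: a candidate found one position later (and, at
         * depth 2, one position after that) only replaces the current one if
         * its "gain" is higher. Gain approximates encoding profit as 3 or 4
         * times the match length minus the offset cost (ZSTD_highbit32 of the
         * stored offset); repcode candidates carry no offset cost. The small
         * constants (+1, +4, +7) added to the incumbent's gain bias the
         * comparison toward keeping the existing match, so the parser only
         * restarts the sequence when the later match is clearly better.
         */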
        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            DEBUGLOG(7, "search depth 1");
            ip ++;
            if ( (dictMode == ZSTD_noDict)
              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                int const gain2 = (int)(mlRep * 3);
                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                if ((mlRep >= 4) && (gain2 > gain1))
                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
            }
            if (isDxS) {
                const U32 repIndex = (U32)(ip - base) - offset_1;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                               dictBase + (repIndex - dictIndexDelta) :
                               base + repIndex;
                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                    int const gain2 = (int)(mlRep * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                }
            }
            {   size_t offset2=999999999;
                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offcode = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                DEBUGLOG(7, "search depth 2");
                ip ++;
                if ( (dictMode == ZSTD_noDict)
                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                    int const gain2 = (int)(mlRep * 4);
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                }
                if (isDxS) {
                    const U32 repIndex = (U32)(ip - base) - offset_1;
                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                        int const gain2 = (int)(mlRep * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                        if ((mlRep >= 4) && (gain2 > gain1))
                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
                    }
                }
                {   size_t offset2=999999999;
                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offcode = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* NOTE:
         * Pay attention that `start[-value]` can lead to strange undefined behavior
         * notably if `value` is unsigned, resulting in a large positive `-value`.
         */
        /* catch up */
        if (STORED_IS_OFFSET(offcode)) {
            if (dictMode == ZSTD_noDict) {
                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
                    { start--; matchLength++; }
            }
            if (isDxS) {
                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            }
            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
        }
        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
            anchor = ip = start + matchLength;
        }
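        /*
         * After a sequence is emitted, offset_2 (the previous repcode) often
         * matches again right at the new anchor. Emitting it immediately as a
         * zero-literal repcode sequence is very cheap to encode, so the loop
         * below keeps doing that, swapping offset_1/offset_2 each time, until
         * the repcode no longer matches.
         */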
        /* check immediate repcode */
        if (isDxS) {
            while (ip <= ilimit) {
                U32 const current2 = (U32)(ip-base);
                U32 const repIndex = current2 - offset_2;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                        dictBase - dictIndexDelta + repIndex :
                        base + repIndex;
                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                    ip += matchLength;
                    anchor = ip;
                    continue;
                }
                break;
            }
        }

        if (dictMode == ZSTD_noDict) {
            while ( ((ip <= ilimit) & (offset_2>0))
                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                /* store sequence */
                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap repcodes */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* Save reps for next block */
    rep[0] = offset_1 ? offset_1 : savedOffset;
    rep[1] = offset_2 ? offset_2 : savedOffset;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}
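/*
 * The entry points below are thin wrappers: each one binds the template
 * parameters (searchMethod, depth, dictMode) so that FORCE_INLINE_TEMPLATE
 * instantiates a fully specialized copy of ZSTD_compressBlock_lazy_generic()
 * per strategy: greedy = depth 0, lazy = depth 1, lazy2/btlazy2 = depth 2.
 */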
size_t ZSTD_compressBlock_btlazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_btlazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
}


size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
}

/* Row-based matchfinder */
size_t ZSTD_compressBlock_lazy2_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
}


size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
}
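/*
 * The extDict variant below handles windows that span two segments: indices
 * below dictLimit live in the "extDict" buffer (dictBase) while newer indices
 * live in the current buffer (base). Every repcode candidate therefore has to
 * be validated against the window (ZSTD_getLowestMatchIndex) and must not
 * straddle the segment boundary before its bytes can be compared.
 */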
FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_lazy_extDict_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd  = dictBase + dictLimit;
    const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
    const U32 windowLog = ms->cParams.windowLog;
    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
    U32 offset_1 = rep[0], offset_2 = rep[1];

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

    /* init */
    ip += (ip == prefixStart);
    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog,
                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                               ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on a 32-byte boundary.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offcode=STORE_REPCODE_1;
        const BYTE* start=ip+1;
        U32 curr = (U32)(ip-base);
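        /*
         * A repcode candidate is usable only if (a) it lies inside the window,
         * i.e. repIndex >= windowLow (checked as `offset_1 <= curr+1 - windowLow`),
         * and (b) its first 4 bytes do not straddle the extDict/prefix boundary:
         * the intentionally-overflowing `(U32)((dictLimit-1) - repIndex) >= 3`
         * rejects repIndex values just below dictLimit while letting both
         * fully-extDict and fully-prefix positions through.
         */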
        /* check repCode */
        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
            const U32 repIndex = (U32)(curr+1 - offset_1);
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                /* repcode detected we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                if (depth==0) goto _storeSequence;
        }   }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offcode=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            curr++;
            /* check repCode */
            if (offcode) {
                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                const U32 repIndex = (U32)(curr - offset_1);
                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                const BYTE* const repMatch = repBase + repIndex;
                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                    /* repcode detected */
                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                    int const gain2 = (int)(repLength * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                    if ((repLength >= 4) && (gain2 > gain1))
                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
            }   }

            /* search match, depth 1 */
            {   size_t offset2=999999999;
                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offcode = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                curr++;
                /* check repCode */
                if (offcode) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                    const BYTE* const repMatch = repBase + repIndex;
                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                        /* repcode detected */
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
                }   }

                /* search match, depth 2 */
                {   size_t offset2=999999999;
                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offcode = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* catch up */
        if (STORED_IS_OFFSET(offcode)) {
            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
        }

        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
            anchor = ip = start + matchLength;
        }

        /* check immediate repcode */
        while (ip <= ilimit) {
            const U32 repCurrent = (U32)(ip-base);
            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
            const U32 repIndex = repCurrent - offset_2;
            const BYTE* const repBase = repIndex < dictLimit ?
                    dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                /* repcode detected we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
            }
            break;
    }   }

    /* Save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}


size_t ZSTD_compressBlock_greedy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
}

size_t ZSTD_compressBlock_lazy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
}

size_t ZSTD_compressBlock_btlazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}

size_t ZSTD_compressBlock_greedy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
}

size_t ZSTD_compressBlock_lazy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
}