Path: blob/master/Utilities/cmzstd/lib/compress/zstd_lazy.c
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */

#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)

#define kLazySkippingStep 8


/*-*************************************
*  Binary Tree search
***************************************/

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_updateDUBT(ZSTD_MatchState_t* ms,
                const BYTE* ip, const BYTE* iend,
                U32 mls)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hashLog = cParams->hashLog;

    U32* const bt = ms->chainTable;
    U32 const btLog  = cParams->chainLog - 1;
    U32 const btMask = (1 << btLog) - 1;

    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    if (idx != target)
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
                    idx, target, ms->window.dictLimit);
    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
    (void)iend;

    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
    for ( ; idx < target ; idx++) {
        size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
        U32 const matchIndex = hashTable[h];

        U32* const nextCandidatePtr = bt + 2*(idx&btMask);
        U32* const sortMarkPtr = nextCandidatePtr + 1;

        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
        hashTable[h] = idx;   /* Update Hash Table */
        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
    }
    ms->nextToUpdate = target;
}


/** ZSTD_insertDUBT1() :
 *  sort one already inserted but unsorted position
 *  assumption : curr >= btlow == (curr - btmask)
 *  doesn't fail */
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms,
                 U32 curr, const BYTE* inputEnd,
                 U32 nbCompares, U32 btLow,
                 const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const bt = ms->chainTable;
    U32 const btLog  = cParams->chainLog - 1;
    U32 const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
    U32* smallerPtr = bt + 2*(curr&btMask);
    U32* largerPtr  = smallerPtr + 1;
    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowValid = ms->window.lowLimit;
    U32 const maxDistance = 1U << cParams->windowLog;
    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
                curr, dictLimit, windowLow);
    assert(curr >= btLow);
    assert(ip < iend);   /* condition for ZSTD_count */

    for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < curr);
        /* note : all candidates are now supposed sorted,
         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
          || (curr < dictLimit) /* both in extDict */) {
            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                     || (matchIndex+matchLength >= dictLimit)) ?
                                        base : dictBase;
            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                 || (curr < dictLimit) );
            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
                    curr, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop, to guarantee consistency; miss a bit of compression, but other solutions can corrupt tree */
        }

        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
            /* match is smaller than current */
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;   /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];  /* new matchIndex, larger than previous and closer to current */
        } else {
            /* match is larger than current */
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
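    /* Note: at this point each branch pointer targets either a real tree node
     * or &dummy32 (when the walk ran past btLow), so writing 0 through both
     * terminates the two sub-branches of the freshly sorted node safely. */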
}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBetterDictMatch (
        const ZSTD_MatchState_t* ms,
        const BYTE* const ip, const BYTE* const iend,
        size_t* offsetPtr,
        size_t bestLength,
        U32 nbCompares,
        U32 const mls,
        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_MatchState_t * const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
    const U32 * const dictHashTable = dms->hashTable;
    U32         const hashLog = dmsCParams->hashLog;
    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32               dictMatchIndex = dictHashTable[h];

    const BYTE* const base = ms->window.base;
    const BYTE* const prefixStart = base + ms->window.dictLimit;
    U32         const curr = (U32)(ip-base);
    const BYTE* const dictBase = dms->window.base;
    const BYTE* const dictEnd = dms->window.nextSrc;
    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
    U32         const dictLowLimit = dms->window.lowLimit;
    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;

    U32*        const dictBt = dms->chainTable;
    U32         const btLog  = dmsCParams->chainLog - 1;
    U32         const btMask = (1 << btLog) - 1;
    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;

    size_t commonLengthSmaller=0, commonLengthLarger=0;

    (void)dictMode;
    assert(dictMode == ZSTD_dictMatchState);

    for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        const BYTE* match = dictBase + dictMatchIndex;
        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
        if (dictMatchIndex+matchLength >= dictHighLimit)
            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */

        if (matchLength > bestLength) {
            U32 matchIndex = dictMatchIndex + dictIndexDelta;
            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            }
            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
            }
        }

        if (match[matchLength] < ip[matchLength]) {
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthSmaller = matchLength;   /* all smaller will now have at least this guaranteed common length */
            dictMatchIndex = nextPtr[1];   /* new matchIndex larger than previous (closer to current) */
        } else {
            /* match is larger than current */
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthLarger = matchLength;
            dictMatchIndex = nextPtr[0];
        }
    }

    if (bestLength >= MINMATCH) {
        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
    }
    return bestLength;

}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iend,
                        size_t* offBasePtr,
                        U32 const mls,
                        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32*   const hashTable = ms->hashTable;
    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32          matchIndex  = hashTable[h];

    const BYTE* const base = ms->window.base;
    U32    const curr = (U32)(ip-base);
    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

    U32*   const bt = ms->chainTable;
    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
    U32    const unsortLimit = MAX(btLow, windowLow);

    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
    U32          nbCompares = 1U << cParams->searchLog;
    U32          nbCandidates = nbCompares;
    U32          previousCandidate = 0;

    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
    assert(ip <= iend-8);   /* required for h calculation */
    assert(dictMode != ZSTD_dedicatedDictSearch);

    /* reach end of unsorted candidates list */
    while ( (matchIndex > unsortLimit)
         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
        nbCandidates --;
    }

    /* nullify last candidate if it's still unsorted
     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
    matchIndex = previousCandidate;
    while (matchIndex) {  /* will end on matchIndex == 0 */
        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
        ZSTD_insertDUBT1(ms, matchIndex, iend,
                         nbCandidates, unsortLimit, dictMode);
        matchIndex = nextCandidateIdx;
        nbCandidates++;
    }

    /* find longest match */
    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(curr&btMask);
        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
        U32 matchEndIdx = curr + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

        matchIndex  = hashTable[h];
        hashTable[h] = curr;   /* Update Hash Table */

        for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
            U32* const nextPtr = bt + 2*(matchIndex & btMask);
            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
            const BYTE* match;

            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                match = base + matchIndex;
                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
            } else {
                match = dictBase + matchIndex;
                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
                if (matchIndex+matchLength >= dictLimit)
                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }

            if (matchLength > bestLength) {
                if (matchLength > matchEndIdx - matchIndex)
                    matchEndIdx = matchIndex + (U32)matchLength;
                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                    if (dictMode == ZSTD_dictMatchState) {
                        nbCompares = 0; /* in addition to avoiding checking any
                                         * further in this loop, make sure we
                                         * skip checking in the dictionary. */
                    }
                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                }
            }

            if (match[matchLength] < ip[matchLength]) {
                /* match is smaller than current */
                *smallerPtr = matchIndex;           /* update smaller idx */
                commonLengthSmaller = matchLength;  /* all smaller will now have at least this guaranteed common length */
                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                smallerPtr = nextPtr+1;             /* new "smaller" => larger of match */
                matchIndex = nextPtr[1];            /* new matchIndex larger than previous (closer to current) */
            } else {
                /* match is larger than current */
                *largerPtr = matchIndex;
                commonLengthLarger = matchLength;
                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                largerPtr = nextPtr;
                matchIndex = nextPtr[0];
        }   }

        *smallerPtr = *largerPtr = 0;

        assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
        if (dictMode == ZSTD_dictMatchState && nbCompares) {
            bestLength = ZSTD_DUBT_findBetterDictMatch(
                    ms, ip, iend,
                    offBasePtr, bestLength, nbCompares,
                    mls, dictMode);
        }

        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
        if (bestLength >= MINMATCH) {
            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
        }
        return bestLength;
    }
}


/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms,
                const BYTE* const ip, const BYTE* const iLimit,
                size_t* offBasePtr,
                const U32 mls /* template */,
                const ZSTD_dictMode_e dictMode)
{
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
    ZSTD_updateDUBT(ms, ip, iLimit, mls);
    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
}

/***********************************
* Dedicated dict search
***********************************/

void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip)
{
    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32* const hashTable = ms->hashTable;
    U32* const chainTable = ms->chainTable;
    U32 const chainSize = 1 << ms->cParams.chainLog;
    U32 idx = ms->nextToUpdate;
    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
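    /* minChain is the oldest index the final chain table can still reference:
     * at most chainSize entries fit, so older positions are only pulled in
     * below as replacements for entries promoted into the hash-cache bucket. */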
    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32 const cacheSize = bucketSize - 1;
    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;

    /* We know the hashtable is oversized by a factor of `bucketSize`.
     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
     * single entry. We will use the rest of the space to construct a temporary
     * chaintable.
     */
    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32* const tmpHashTable = hashTable;
    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
    U32 hashIdx;

    assert(ms->cParams.chainLog <= 24);
    assert(ms->cParams.hashLog > ms->cParams.chainLog);
    assert(idx != 0);
    assert(tmpMinChain <= minChain);

    /* fill conventional hash table and conventional chain table */
    for ( ; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
        if (idx >= tmpMinChain) {
            tmpChainTable[idx - tmpMinChain] = hashTable[h];
        }
        tmpHashTable[h] = idx;
    }

    /* sort chains into ddss chain table */
    {
        U32 chainPos = 0;
        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
            U32 count;
            U32 countBeyondMinChain = 0;
            U32 i = tmpHashTable[hashIdx];
            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
                /* skip through the chain to the first position that won't be
                 * in the hash cache bucket */
                if (i < minChain) {
                    countBeyondMinChain++;
                }
                i = tmpChainTable[i - tmpMinChain];
            }
            if (count == cacheSize) {
                for (count = 0; count < chainLimit;) {
                    if (i < minChain) {
                        if (!i || ++countBeyondMinChain > cacheSize) {
                            /* only allow pulling `cacheSize` number of entries
                             * into the cache or chainTable beyond `minChain`,
                             * to replace the entries pulled out of the
                             * chainTable into the cache. This lets us reach
                             * back further without increasing the total number
                             * of entries in the chainTable, guaranteeing the
                             * DDSS chain table will fit into the space
                             * allocated for the regular one. */
                            break;
                        }
                    }
                    chainTable[chainPos++] = i;
                    count++;
                    if (i < tmpMinChain) {
                        break;
                    }
                    i = tmpChainTable[i - tmpMinChain];
                }
            } else {
                count = 0;
            }
            if (count) {
                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
            } else {
                tmpHashTable[hashIdx] = 0;
            }
        }
        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
    }

    /* move chain pointers into the last entry of each hash bucket */
    for (hashIdx = (1 << hashLog); hashIdx; ) {
        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 const chainPackedPointer = tmpHashTable[hashIdx];
        U32 i;
        for (i = 0; i < cacheSize; i++) {
            hashTable[bucketIdx + i] = 0;
        }
        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
    }

    /* fill the buckets of the hash table */
    for (idx = ms->nextToUpdate; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 i;
        /* Shift hash cache down 1. */
        for (i = cacheSize - 1; i; i--)
            hashTable[h + i] = hashTable[h + i - 1];
        hashTable[h] = idx;
    }

    ms->nextToUpdate = target;
}

/* Returns the longest match length found in the dedicated dict search structure.
 * If none are longer than the argument ml, then ml will be returned.
 */
FORCE_INLINE_TEMPLATE
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
                                            const ZSTD_MatchState_t* const dms,
                                            const BYTE* const ip, const BYTE* const iLimit,
                                            const BYTE* const prefixStart, const U32 curr,
                                            const U32 dictLimit, const size_t ddsIdx) {
    const U32 ddsLowestIndex  = dms->window.dictLimit;
    const BYTE* const ddsBase = dms->window.base;
    const BYTE* const ddsEnd  = dms->window.nextSrc;
    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
    const U32 ddsIndexDelta   = dictLimit - ddsSize;
    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
    U32 ddsAttempt;
    U32 matchIndex;

    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 const chainIndex = chainPackedPointer >> 8;

        PREFETCH_L1(&dms->chainTable[chainIndex]);
    }

    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
        size_t currentMl=0;
        const BYTE* match;
        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
        match = ddsBase + matchIndex;

        if (!matchIndex) {
            return ml;
        }

        /* guaranteed by table construction */
        (void)ddsLowestIndex;
        assert(matchIndex >= ddsLowestIndex);
        assert(match+4 <= ddsEnd);
        if (MEM_read32(match) == MEM_read32(ip)) {
            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
            if (ip+currentMl == iLimit) {
                /* best possible, avoids read overflow on next attempt */
                return ml;
            }
        }
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 chainIndex = chainPackedPointer >> 8;
        U32 const chainLength = chainPackedPointer & 0xFF;
        U32 const chainAttempts = nbAttempts - ddsAttempt;
        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
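        /* Illustration: chainPackedPointer stores the chain's start position in
         * its upper 24 bits and its length in the low 8 bits, as written by
         * ZSTD_dedicatedDictSearch_lazy_loadDictionary() via
         * ((chainPos - count) << 8) + count. For example, a packed value of
         * 0x00012345 decodes to chainIndex = 0x123 and chainLength = 0x45. */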
        U32 chainAttempt;

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
        }

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
            size_t currentMl=0;
            const BYTE* match;
            matchIndex = dms->chainTable[chainIndex];
            match = ddsBase + matchIndex;

            /* guaranteed by table construction */
            assert(matchIndex >= ddsLowestIndex);
            assert(match+4 <= ddsEnd);
            if (MEM_read32(match) == MEM_read32(ip)) {
                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
            }

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }
    return ml;
}


/* *********************************
*  Hash Chain
***********************************/
#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]

/* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_insertAndFindFirstIndex_internal(
                        ZSTD_MatchState_t* ms,
                        const ZSTD_compressionParameters* const cParams,
                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
    U32* const hashTable  = ms->hashTable;
    const U32 hashLog = cParams->hashLog;
    U32* const chainTable = ms->chainTable;
    const U32 chainMask = (1 << cParams->chainLog) - 1;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
        hashTable[h] = idx;
        idx++;
        /* Stop inserting every position when in the lazy skipping mode. */
        if (lazySkipping)
            break;
    }

    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
}

U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_HcFindBestMatch(
                        ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const chainTable = ms->chainTable;
    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
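    /* While a dictionary is attached (loadedDictEnd != 0) the whole valid range
     * may be searched; otherwise candidates are clamped to the last
     * (1 << windowLog) bytes. */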
    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;

    U32 matchIndex;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32* entry = &dms->hashTable[ddsIdx];
        PREFETCH_L1(entry);
    }

    /* HC4 match finder */
    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {
            const BYTE* const match = dictBase + matchIndex;
            assert(match+4 <= dictEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
        }

        if (matchIndex <= minChain) break;
        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        const U32* const dmsChainTable = dms->chainTable;
        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
        const U32 dmsChainMask         = dmsChainSize - 1;
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;
        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
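        /* Indices at or below dmsMinChain may have had their slots in the
         * dictionary's circular chain table overwritten by newer positions,
         * so the walk below stops once it reaches them. */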
        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
            size_t currentMl=0;
            const BYTE* const match = dmsBase + matchIndex;
            assert(match+4 <= dmsEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                assert(curr > matchIndex + dmsIndexDelta);
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }

            if (matchIndex <= dmsMinChain) break;

            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
        }
    }

    return ml;
}

/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
#define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */

#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)

typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */

/* ZSTD_VecMask_next():
 * Starting from the LSB, returns the idx of the next non-zero bit.
 * Basically counting the nb of trailing zeroes.
 */
MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
    return ZSTD_countTrailingZeros64(val);
}

/* ZSTD_row_nextIndex():
 * Returns the next index to insert at within a tagTable row, and updates the "head"
 * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
    U32 next = (*tagRow-1) & rowMask;
    next += (next == 0) ? rowMask : 0;  /* skip first position */
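    /* e.g. with rowMask == 15 the head walks 15, 14, ..., 2, 1, 15, ... ;
     * position 0 is never returned because it holds the head byte itself
     * (matchPos == 0 is likewise skipped by the search loops below). */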
    *tagRow = (BYTE)next;
    return next;
}

/* ZSTD_isAligned():
 * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
 */
MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
    assert((align & (align - 1)) == 0);
    return (((size_t)ptr) & (align - 1)) == 0;
}

/* ZSTD_row_prefetch():
 * Performs prefetching for the hashTable and tagTable at a given row.
 */
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
    PREFETCH_L1(hashTable + relRow);
    if (rowLog >= 5) {
        PREFETCH_L1(hashTable + relRow + 16);
        /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
    }
    PREFETCH_L1(tagTable + relRow);
    if (rowLog == 6) {
        PREFETCH_L1(tagTable + relRow + 32);
    }
    assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
}

/* ZSTD_row_fillHashCache():
 * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
 * but not beyond iLimit.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base,
                            U32 const rowLog, U32 const mls,
                            U32 idx, const BYTE* const iLimit)
{
    U32 const* const hashTable = ms->hashTable;
    BYTE const* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

    for (; idx < lim; ++idx) {
        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
    }

    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
             ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
             ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
}

/* ZSTD_row_nextCachedHash():
 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
                            BYTE const* tagTable, BYTE const* base,
                            U32 idx, U32 const hashLog,
                            U32 const rowLog, U32 const mls,
                            U64 const hashSalt)
{
    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
        return hash;
    }
}

/* ZSTD_row_update_internalImpl():
 * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms,
                                  U32 updateStartIdx, U32 const updateEndIdx,
                                  U32 const mls, U32 const rowLog,
                                  U32 const rowMask, U32 const useCache)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    const BYTE* const base = ms->window.base;

    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
    for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = tagTable + relRow;
        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
        row[pos] = updateStartIdx;
    }
}

/* ZSTD_row_update_internal():
 * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
 * Skips sections of long matches as is necessary.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip,
                              U32 const mls, U32 const rowLog,
                              U32 const rowMask, U32 const useCache)
{
    U32 idx = ms->nextToUpdate;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    const U32 kSkipThreshold = 384;
    const U32 kMaxMatchStartPositionsToUpdate = 96;
    const U32 kMaxMatchEndPositionsToUpdate = 32;

    if (useCache) {
        /* Only skip positions when using hash cache, i.e.
         * if we are loading a dict, don't skip anything.
         * If we decide to skip, then we only update a set number
         * of positions at the beginning and end of the match.
         */
        if (UNLIKELY(target - idx > kSkipThreshold)) {
            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
            idx = target - kMaxMatchEndPositionsToUpdate;
            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
        }
    }
    assert(target >= idx);
    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
    ms->nextToUpdate = target;
}

/* ZSTD_row_update():
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) {
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
    const U32 rowMask = (1u << rowLog) - 1;
    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
}

/* Returns the mask width of bits group of which will be set to 1. Given not all
 * architectures have easy movemask instruction, this helps to iterate over
 * groups of bits easier and faster.
 */
FORCE_INLINE_TEMPLATE U32
ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
    (void)rowEntries;
#if defined(ZSTD_ARCH_ARM_NEON)
    /* NEON path only works for little endian */
    if (!MEM_isLittleEndian()) {
        return 1;
    }
    if (rowEntries == 16) {
        return 4;
    }
    if (rowEntries == 32) {
        return 2;
    }
    if (rowEntries == 64) {
        return 1;
    }
#endif
    return 1;
}

#if defined(ZSTD_ARCH_X86_SSE2)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
    const __m128i comparisonMask = _mm_set1_epi8((char)tag);
    int matches[4] = {0};
    int i;
    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
    for (i=0; i<nbChunks; i++) {
        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
        matches[i] = _mm_movemask_epi8(equalMask);
    }
    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
    assert(nbChunks == 4);
    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
}
#endif

#if defined(ZSTD_ARCH_ARM_NEON)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    if (rowEntries == 16) {
        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
         * After that groups of 4 bits represent the equalMask. We lower
         * all bits except the highest in these groups by doing AND with
         * 0x88 = 0b10001000.
         */
        const uint8x16_t chunk = vld1q_u8(src);
        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
    } else if (rowEntries == 32) {
        /* Same idea as with rowEntries == 16 but doing AND with
         * 0x55 = 0b01010101.
         */
        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
        const uint8x16_t dup = vdupq_n_u8(tag);
        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
    } else { /* rowEntries == 64 */
        const uint8x16x4_t chunk = vld4q_u8(src);
        const uint8x16_t dup = vdupq_n_u8(tag);
        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);

        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
        return ZSTD_rotateRight_U64(matches, headGrouped);
    }
}
#endif

/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
 * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
 * matches the hash at the nth position in a row of the tagTable.
 * Each row is a circular buffer beginning at the value of "headGrouped". So we
 * must rotate the "matches" bitfield to match up with the actual layout of the
 * entries within the hashTable */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
{
    const BYTE* const src = tagRow;
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);

#if defined(ZSTD_ARCH_X86_SSE2)

    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);

#else /* SW or NEON-LE */

# if defined(ZSTD_ARCH_ARM_NEON)
    /* This NEON path only works for little endian - otherwise use SWAR below */
    if (MEM_isLittleEndian()) {
        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
    }
# endif /* ZSTD_ARCH_ARM_NEON */
    /* SWAR */
    {   const int chunkSize = sizeof(size_t);
        const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
        const size_t xFF = ~((size_t)0);
        const size_t x01 = xFF / 0xFF;
        const size_t x80 = x01 << 7;
        const size_t splatChar = tag * x01;
        ZSTD_VecMask matches = 0;
        int i = rowEntries - chunkSize;
        assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
        if (MEM_isLittleEndian()) { /* runtime check so have two loops */
            const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
            do {
                size_t chunk = MEM_readST(&src[i]);
                chunk ^= splatChar;
                chunk = (((chunk | x80) - x01) | chunk) & x80;
                matches <<= chunkSize;
                matches |= (chunk * extractMagic) >> shiftAmount;
                i -= chunkSize;
            } while (i >= 0);
        } else { /* big endian: reverse bits during extraction */
            const size_t msb = xFF ^ (xFF >> 1);
            const size_t extractMagic = (msb / 0x1FF) | msb;
            do {
                size_t chunk = MEM_readST(&src[i]);
                chunk ^= splatChar;
                chunk = (((chunk | x80) - x01) | chunk) & x80;
                matches <<= chunkSize;
                matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
                i -= chunkSize;
            } while (i >= 0);
        }
        matches = ~matches;
        if (rowEntries == 16) {
            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
        } else if (rowEntries == 32) {
            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
        } else {
            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
        }
    }
#endif
}

/* The high-level approach of the SIMD row based match finder is as follows:
 * - Figure out where to insert the new entry:
 *     - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index.
 *         - The hash is salted by a value that changes on every context reset, so when the same table is used
 *           we will avoid collisions that would otherwise slow us down by introducing phantom matches.
 *     - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
 *       which row to insert into.
 *     - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
 *       be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
 *       per row).
 * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
 *   generate a bitfield that we can cycle through to check the collisions in the hash table.
 * - Pick the longest match.
 * - Insert the tag into the equivalent row and position in the tagTable.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_RowFindBestMatch(
                        ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode,
                        const U32 rowLog)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32* const hashCache = ms->hashCache;
    const U32 hashLog = ms->rowHashLog;
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 rowEntries = (1U << rowLog);
    const U32 rowMask = rowEntries - 1;
    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
    const U64 hashSalt = ms->hashSalt;
    U32 nbAttempts = 1U << cappedSearchLog;
    size_t ml=4-1;
    U32 hash;

    /* DMS/DDS variables that may be referenced later */
    const ZSTD_MatchState_t* const dms = ms->dictMatchState;

    /* Initialize the following variables to satisfy static analyzer */
    size_t ddsIdx = 0;
    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
    U32 dmsTag = 0;
    U32* dmsRow = NULL;
    BYTE* dmsTagRow = NULL;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
        {   /* Prefetch DDS hashtable entry */
            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
            PREFETCH_L1(&dms->hashTable[ddsIdx]);
        }
        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
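        /* nbAttempts above is capped at (1 << rowLog); when searchLog is larger,
         * these extra attempts are granted to the dedicated dict search instead
         * (see the nbAttempts + ddsExtraAttempts call site below). */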
    }

    if (dictMode == ZSTD_dictMatchState) {
        /* Prefetch DMS rows */
        U32* const dmsHashTable = dms->hashTable;
        BYTE* const dmsTagTable = dms->tagTable;
        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
        dmsRow = dmsHashTable + dmsRelRow;
        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
    }

    /* Update the hashTable and tagTable up to (but not including) ip */
    if (!ms->lazySkipping) {
        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
    } else {
        /* Stop inserting every position when in the lazy skipping mode.
         * The hash cache is also not kept up to date in this mode.
         */
        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
        ms->nextToUpdate = curr;
    }
    ms->hashSaltEntropy += hash; /* collect salt entropy */

    {   /* Get the hash for ip, compute the appropriate row */
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = (BYTE*)(tagTable + relRow);
        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
        U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
        size_t numMatches = 0;
        size_t currMatch = 0;
        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);

        /* Cycle through the matches and prefetch */
        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
            U32 const matchIndex = row[matchPos];
            if(matchPos == 0) continue;
            assert(numMatches < rowEntries);
            if (matchIndex < lowLimit)
                break;
            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                PREFETCH_L1(base + matchIndex);
            } else {
                PREFETCH_L1(dictBase + matchIndex);
            }
            matchBuffer[numMatches++] = matchIndex;
            --nbAttempts;
        }

        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
           in ZSTD_row_update_internal() at the next search. */
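        /* The entry inserted here is indexed by ms->nextToUpdate, which is then
         * advanced by one, so the next update pass starts right after it. */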
        {   U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
            tagRow[pos] = (BYTE)tag;
            row[pos] = ms->nextToUpdate++;
        }

        /* Return the longest match */
        for (; currMatch < numMatches; ++currMatch) {
            U32 const matchIndex = matchBuffer[currMatch];
            size_t currentMl=0;
            assert(matchIndex < curr);
            assert(matchIndex >= lowLimit);

            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                const BYTE* const match = base + matchIndex;
                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                    currentMl = ZSTD_count(ip, match, iLimit);
            } else {
                const BYTE* const match = dictBase + matchIndex;
                assert(match+4 <= dictEnd);
                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
            }

            /* Save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        /* TODO: Measure and potentially add prefetching to DMS */
        const U32 dmsLowestIndex  = dms->window.dictLimit;
        const BYTE* const dmsBase = dms->window.base;
        const BYTE* const dmsEnd  = dms->window.nextSrc;
        const U32 dmsSize         = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta   = dictLimit - dmsSize;

        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
            U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
            size_t numMatches = 0;
            size_t currMatch = 0;
            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);

            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                U32 const matchIndex = dmsRow[matchPos];
                if(matchPos == 0) continue;
                if (matchIndex < dmsLowestIndex)
                    break;
                PREFETCH_L1(dmsBase + matchIndex);
                matchBuffer[numMatches++] = matchIndex;
                --nbAttempts;
            }

            /* Return the longest match */
            for (; currMatch < numMatches; ++currMatch) {
                U32 const matchIndex = matchBuffer[currMatch];
                size_t currentMl=0;
                assert(matchIndex >= dmsLowestIndex);
                assert(matchIndex < curr);

                {   const BYTE* const match = dmsBase + matchIndex;
                    assert(match+4 <= dmsEnd);
                    if (MEM_read32(match) == MEM_read32(ip))
                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
                }

                if (currentMl > ml) {
                    ml = currentMl;
                    assert(curr > matchIndex + dmsIndexDelta);
                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                    if (ip+currentMl == iLimit) break;
                }
            }
        }
    }
    return ml;
}


/**
 * Generate search functions templated on (dictMode, mls, rowLog).
 * These functions are outlined for code size & compilation time.
 * ZSTD_searchMax() dispatches to the correct implementation function.
 *
 * TODO: The start of the search function involves loading and calculating a
 * bunch of constants from the ZSTD_MatchState_t. These computations could be
 * done in an initialization function, and saved somewhere in the match state.
 * Then we could pass a pointer to the saved state instead of the match state,
 * and avoid duplicate computations.
 *
 * TODO: Move the match re-winding into searchMax. This improves compression
 * ratio, and unlocks further simplifications with the next TODO.
 *
 * TODO: Try moving the repcode search into searchMax. After the re-winding
 * and repcode search are in searchMax, there is no more logic in the match
 * finder loop that requires knowledge about the dictMode. So we should be
 * able to avoid force inlining it, and we can join the extDict loop with
 * the single segment loop. It should go in searchMax instead of its own
 * function to avoid having multiple virtual function calls per search.
 */

#define ZSTD_BT_SEARCH_FN(dictMode, mls)  ZSTD_BtFindBestMatch_##dictMode##_##mls
#define ZSTD_HC_SEARCH_FN(dictMode, mls)  ZSTD_HcFindBestMatch_##dictMode##_##mls
#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog

#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE

#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \
    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \
            ZSTD_MatchState_t* ms,                                                     \
            const BYTE* ip, const BYTE* const iLimit,                                  \
            size_t* offBasePtr)                                                        \
    {                                                                                  \
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
    }                                                                                  \

#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \
    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \
            ZSTD_MatchState_t* ms,                                                    \
            const BYTE* ip, const BYTE* const iLimit,                                 \
            size_t* offsetPtr)                                                        \
    {                                                                                 \
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
    }                                                                                 \

#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \
    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \
            ZSTD_MatchState_t* ms,                                                             \
            const BYTE* ip, const BYTE* const iLimit,                                          \
            size_t* offsetPtr)                                                                 \
    {                                                                                          \
        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                                   \
        assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \
        return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
    }                                                                                          \

#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
    X(dictMode, mls, 4)                        \
    X(dictMode, mls, 5)                        \
    X(dictMode, mls, 6)

#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4)      \
    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5)      \
    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)

#define ZSTD_FOR_EACH_MLS(X, dictMode) \
    X(dictMode, 4)                     \
    X(dictMode, 5)                     \
    X(dictMode, 6)

#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
    X(__VA_ARGS__, noDict)              \
    X(__VA_ARGS__, extDict)             \
    X(__VA_ARGS__, dictMatchState)      \
    X(__VA_ARGS__, dedicatedDictSearch)

/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
/* Generate binary Tree search fns for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
/* Generate hash chain search fns for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)

typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls)                         \
    case mls:                                                             \
        return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls)                         \
    case mls:                                                             \
        return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog)                         \
    case rowLog:                                                                   \
        return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);

#define ZSTD_SWITCH_MLS(X, dictMode)   \
    switch (mls) {                     \
        ZSTD_FOR_EACH_MLS(X, dictMode) \
    }

#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
    case mls:                                                                \
        switch (rowLog) {                                                    \
            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
        }                                                                    \
        ZSTD_UNREACHABLE;                                                    \
        break;

#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \
    switch (searchMethod) {                                       \
        case search_hashChain:                                    \
            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
            break;                                                \
        case search_binaryTree:                                   \
            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
            break;                                                \
        case search_rowHash:                                      \
            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
            break;                                                \
    }                                                             \
    ZSTD_UNREACHABLE;

/**
 * Searches for the longest match at @p ip.
 * Dispatches to the correct implementation function based on the
 * (searchMethod, dictMode, mls, rowLog). We use switch statements
 * here instead of using an indirect function call through a function
 * pointer because after Spectre and Meltdown mitigations, indirect
 * function calls can be very costly, especially in the kernel.
 *
 * NOTE: dictMode and searchMethod should be templated, so those switch
 * statements should be optimized out. Only the mls & rowLog switches

/**
 * Searches for the longest match at @p ip.
 * Dispatches to the correct implementation function based on the
 * (searchMethod, dictMode, mls, rowLog). We use switch statements
 * here instead of using an indirect function call through a function
 * pointer because after Spectre and Meltdown mitigations, indirect
 * function calls can be very costly, especially in the kernel.
 *
 * NOTE: dictMode and searchMethod should be templated, so those switch
 * statements should be optimized out. Only the mls & rowLog switches
 * should be left.
 *
 * @param ms The match state.
 * @param ip The position to search at.
 * @param iend The end of the input data.
 * @param[out] offsetPtr Stores the match offset into this pointer.
 * @param mls The minimum search length, in the range [4, 6].
 * @param rowLog The row log (if applicable), in the range [4, 6].
 * @param searchMethod The search method to use (templated).
 * @param dictMode The dictMode (templated).
 *
 * @returns The length of the longest match found, or < mls if no match is found.
 * If a match is found its offset is stored in @p offsetPtr.
 */
FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
    ZSTD_MatchState_t* ms,
    const BYTE* ip,
    const BYTE* iend,
    size_t* offsetPtr,
    U32 const mls,
    U32 const rowLog,
    searchMethod_e const searchMethod,
    ZSTD_dictMode_e const dictMode)
{
    if (dictMode == ZSTD_noDict) {
        ZSTD_SWITCH_SEARCH_METHOD(noDict)
    } else if (dictMode == ZSTD_extDict) {
        ZSTD_SWITCH_SEARCH_METHOD(extDict)
    } else if (dictMode == ZSTD_dictMatchState) {
        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
    } else if (dictMode == ZSTD_dedicatedDictSearch) {
        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
    }
    ZSTD_UNREACHABLE;
    return 0;
}

/* *******************************
*  Common parser - lazy strategy
*********************************/

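/* Editorial overview, not authoritative: at each position the parser below proceeds
 * roughly as follows, with `depth` selecting greedy (0), lazy (1) or lazy2/btlazy2 (2):
 *
 *     while (ip < ilimit) {
 *         find the best match at ip (repcode candidate first, then ZSTD_searchMax);
 *         if (depth >= 1) also try ip+1 (and ip+2 when depth == 2),
 *             keeping a later match only if its "gain" beats the current one;
 *         extend the chosen match backwards ("catch up");
 *         store the sequence, then greedily consume immediate repcodes;
 *     }
 *
 * The real loop below additionally handles the dictMatchState / dedicatedDictSearch
 * cases and the lazy-skipping optimization. */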
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_lazy_generic(
                        ZSTD_MatchState_t* ms, SeqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth,
                        ZSTD_dictMode_e const dictMode)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 prefixLowestIndex = ms->window.dictLimit;
    const BYTE* const prefixLowest = base + prefixLowestIndex;
    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

    U32 offset_1 = rep[0], offset_2 = rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    const int isDMS = dictMode == ZSTD_dictMatchState;
    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
    const int isDxS = isDMS || isDDS;
    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const U32 dictLowestIndex    = isDxS ? dms->window.dictLimit : 0;
    const BYTE* const dictBase   = isDxS ? dms->window.base : NULL;
    const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
    const BYTE* const dictEnd    = isDxS ? dms->window.nextSrc : NULL;
    const U32 dictIndexDelta     = isDxS ?
                                   prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                   0;
    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
    ip += (dictAndPrefixLength == 0);
    if (dictMode == ZSTD_noDict) {
        U32 const curr = (U32)(ip - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
        U32 const maxRep = curr - windowLow;
        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
    }
    if (isDxS) {
        /* dictMatchState repCode checks don't currently handle repCode == 0
         * disabling. */
        assert(offset_1 <= dictAndPrefixLength);
        assert(offset_2 <= dictAndPrefixLength);
    }

    /* Reset the lazy skipping state */
    ms->lazySkipping = 0;

    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random ~5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offBase = REPCODE1_TO_OFFBASE;
        const BYTE* start=ip+1;
        DEBUGLOG(7, "search baseline (depth 0)");

        /* check repCode */
        if (isDxS) {
            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                && repIndex < prefixLowestIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                if (depth==0) goto _storeSequence;
            }
        }
        if ( dictMode == ZSTD_noDict
          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            if (depth==0) goto _storeSequence;
        }

        /* first search (depth 0) */
        {   size_t offbaseFound = 999999999;
            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offBase = offbaseFound;
        }

        if (matchLength < 4) {
            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            ip += step;
            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
             * In this mode we stop inserting every position into our tables, and only insert
             * positions that we search, which is one in step positions.
             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
             * triggered once we've gone 2KB without finding any matches
             * (with kSearchStrength == 8, step = ((ip - anchor) >> 8) + 1 first exceeds
             * kLazySkippingStep when ip - anchor reaches 2048 bytes).
             */
            ms->lazySkipping = step > kLazySkippingStep;
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            DEBUGLOG(7, "search depth 1");
            ip ++;
            if ( (dictMode == ZSTD_noDict)
              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                int const gain2 = (int)(mlRep * 3);
                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                if ((mlRep >= 4) && (gain2 > gain1))
                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
            }
            if (isDxS) {
                const U32 repIndex = (U32)(ip - base) - offset_1;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                               dictBase + (repIndex - dictIndexDelta) :
                               base + repIndex;
                if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                    int const gain2 = (int)(mlRep * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                }
            }
            {   size_t ofbCandidate=999999999;
                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                    continue;   /* search a better one */
            }   }
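
            /* Editorial note on the gain heuristic above and below (illustrative numbers,
             * not taken from the library): a candidate from a later position replaces the
             * current match only if length*4 minus the log2 cost of its offset beats the
             * current match's score, which receives a fixed bonus (+4 at depth 1, +7 at
             * depth 2), so the parser only switches when the new match is clearly better.
             * For example, a current match of length 6 with offBase 67 scores
             * 6*4 - ZSTD_highbit32(67) + 4 = 24 - 6 + 4 = 22, while a 7-byte candidate with
             * offBase 1027 scores 7*4 - ZSTD_highbit32(1027) = 28 - 10 = 18, so the earlier,
             * closer match is kept. Repcode candidates pay no offset cost (length*3 here,
             * length*4 at depth 2), presumably because repcodes are cheap to encode. */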

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                DEBUGLOG(7, "search depth 2");
                ip ++;
                if ( (dictMode == ZSTD_noDict)
                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                    int const gain2 = (int)(mlRep * 4);
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                }
                if (isDxS) {
                    const U32 repIndex = (U32)(ip - base) - offset_1;
                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
                    if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                        int const gain2 = (int)(mlRep * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                        if ((mlRep >= 4) && (gain2 > gain1))
                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                    }
                }
                {   size_t ofbCandidate=999999999;
                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* NOTE:
         * Pay attention that `start[-value]` can lead to strange undefined behavior
         * notably if `value` is unsigned, resulting in a large positive `-value`.
         */
        /* catch up */
        if (OFFBASE_IS_OFFSET(offBase)) {
            if (dictMode == ZSTD_noDict) {
                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
                    { start--; matchLength++; }
            }
            if (isDxS) {
                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            }
            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
        }
        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
            anchor = ip = start + matchLength;
        }
        if (ms->lazySkipping) {
            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
            if (searchMethod == search_rowHash) {
                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
            }
            ms->lazySkipping = 0;
        }

        /* check immediate repcode */
        if (isDxS) {
            while (ip <= ilimit) {
                U32 const current2 = (U32)(ip-base);
                U32 const repIndex = current2 - offset_2;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                        dictBase - dictIndexDelta + repIndex :
                        base + repIndex;
                if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex))
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                    ip += matchLength;
                    anchor = ip;
                    continue;
                }
                break;
            }
        }

        if (dictMode == ZSTD_noDict) {
            while ( ((ip <= ilimit) & (offset_2>0))
                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                /* store sequence */
                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap repcodes */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}
#endif /* build exclusions */


#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_greedy(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy_dictMatchState_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy_dictMatchState_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy2(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy2_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
}
#endif

#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_btlazy2(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_btlazy2_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
}
#endif

#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
    || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
    || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
    || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_lazy_extDict_generic(
                        ZSTD_MatchState_t* ms, SeqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const dictStart = dictBase + ms->window.lowLimit;
    const U32 windowLog = ms->cParams.windowLog;
    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

    U32 offset_1 = rep[0], offset_2 = rep[1];

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

    /* Reset the lazy skipping state */
    ms->lazySkipping = 0;

    /* init */
    ip += (ip == prefixStart);
    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random ~5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offBase = REPCODE1_TO_OFFBASE;
        const BYTE* start=ip+1;
        U32 curr = (U32)(ip-base);

        /* check repCode */
        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
            const U32 repIndex = (U32)(curr+1 - offset_1);
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
               & (offset_1 <= curr+1 - windowLow) )  /* note: we are searching at curr+1 */
            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                /* repcode detected, we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                if (depth==0) goto _storeSequence;
        }   }

        /* first search (depth 0) */
        {   size_t ofbCandidate = 999999999;
            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offBase = ofbCandidate;
        }

        if (matchLength < 4) {
            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
            ip += step + 1;   /* jump faster over incompressible sections */
            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
             * In this mode we stop inserting every position into our tables, and only insert
             * positions that we search, which is one in step positions.
             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
             * triggered once we've gone 2KB without finding any matches.
             */
            ms->lazySkipping = step > kLazySkippingStep;
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            curr++;
            /* check repCode */
            if (offBase) {
                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                const U32 repIndex = (U32)(curr - offset_1);
                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                const BYTE* const repMatch = repBase + repIndex;
                if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
                   & (offset_1 <= curr - windowLow) )  /* equivalent to `curr > repIndex >= windowLow` */
                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                    /* repcode detected */
                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                    int const gain2 = (int)(repLength * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                    if ((repLength >= 4) && (gain2 > gain1))
                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
            }   }

            /* search match, depth 1 */
            {   size_t ofbCandidate = 999999999;
                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                curr++;
                /* check repCode */
                if (offBase) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                    const BYTE* const repMatch = repBase + repIndex;
                    if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
                       & (offset_1 <= curr - windowLow) )  /* equivalent to `curr > repIndex >= windowLow` */
                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                        /* repcode detected */
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                }   }

                /* search match, depth 2 */
                {   size_t ofbCandidate = 999999999;
                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* catch up */
        if (OFFBASE_IS_OFFSET(offBase)) {
            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }   /* catch up */
            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
        }

        /* store sequence */
_storeSequence:
        {   size_t const litLength = (size_t)(start - anchor);
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
            anchor = ip = start + matchLength;
        }
        if (ms->lazySkipping) {
            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
            if (searchMethod == search_rowHash) {
                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
            }
            ms->lazySkipping = 0;
        }

        /* check immediate repcode */
        while (ip <= ilimit) {
            const U32 repCurrent = (U32)(ip-base);
            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
            const U32 repIndex = repCurrent - offset_2;
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( (ZSTD_index_overlap_check(dictLimit, repIndex))
               & (offset_2 <= repCurrent - windowLow) )  /* equivalent to `curr > repIndex >= windowLow` */
            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                /* repcode detected, we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
            }
            break;
    }   }

    /* Save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}
#endif /* build exclusions */

#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_greedy_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
}

size_t ZSTD_compressBlock_greedy_extDict_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
}

size_t ZSTD_compressBlock_lazy_extDict_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
}
#endif

#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_lazy2_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
}

size_t ZSTD_compressBlock_lazy2_extDict_row(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
}
#endif

#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_btlazy2_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}
#endif