Path: blob/master/Utilities/cmzstd/lib/compress/zstd_fast.c
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
#include "zstd_fast.h"

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_full);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }

        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
    }   }   }   }
}
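
/* Unlike the plain CCtx table filled below, the CDict table stores "tagged"
 * entries: the hash is computed over hashLog + ZSTD_SHORT_CACHE_TAG_BITS bits,
 * and ZSTD_writeTaggedIndex() packs the position index together with the low
 * tag bits of the hash into a single U32 entry (see zstd_compress_internal.h
 * for the exact packing). The dictMatchState search further down can then use
 * ZSTD_comparePackedTags() to discard most non-matching candidates from the
 * table entry alone, without reading dictionary content. */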

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hBits = cParams->hashLog;
    U32  const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_fast);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
        hashTable[hash0] = curr;
        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hash] == 0) {  /* not yet filled */
                    hashTable[hash] = curr + p;
    }   }   }   }
}

void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm,
                        ZSTD_tableFillPurpose_e tfp)
{
    if (tfp == ZSTD_tfp_forCDict) {
        ZSTD_fillHashTableForCDict(ms, end, dtlm);
    } else {
        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
    }
}


typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);

static int
ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* Array of ~random data, should have low probability of matching data.
     * Load from here if the index is invalid.
     * Used to avoid unpredictable branches. */
    static const BYTE dummy[] = {0x12,0x34,0x56,0x78};

    /* currentIdx >= lowLimit is a (somewhat) unpredictable branch.
     * However expression below compiles into conditional move.
     */
    const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
    /* Note: this used to be written as : return test1 && test2;
     * Unfortunately, once inlined, these tests become branches,
     * in which case it becomes critical that they are executed in the right order (test1 then test2).
     * So we have to write these tests in a specific manner to ensure their ordering.
     */
    if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
    /* force ordering of these tests, which matters once the function is inlined, as they become branches */
#if defined(__GNUC__)
    __asm__("");
#endif
    return matchIdx >= idxLowLimit;
}

static int
ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* using a branch instead of a cmov,
     * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
     * aka almost all candidates are within range */
    U32 mval;
    if (matchIdx >= idxLowLimit) {
        mval = MEM_read32(matchAddress);
    } else {
        mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
    }

    return (MEM_read32(currentPtr) == mval);
}
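
/* Note: ZSTD_compressBlock_fast() below selects between these two variants:
 * the cmov variant when windowLog < 19, where the "candidate in range" check
 * is likely unpredictable, and the branch variant otherwise, when nearly all
 * candidates are within range and the branch is well predicted. */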

/**
 * If you squint hard enough (and ignore repcodes), the search operation at any
 * given position is broken into 4 stages:
 *
 * 1. Hash   (map position to hash value via input read)
 * 2. Lookup (map hash val to index via hashtable read)
 * 3. Load   (map index to value at that position via input read)
 * 4. Compare
 *
 * Each of these steps involves a memory read at an address which is computed
 * from the previous step. This means these steps must be sequenced and their
 * latencies are cumulative.
 *
 * Rather than do 1->2->3->4 sequentially for a single position before moving
 * onto the next, this implementation interleaves these operations across the
 * next few positions:
 *
 * R = Repcode Read & Compare
 * H = Hash
 * T = Table Lookup
 * M = Match Read & Compare
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | ... M
 * N+1 | ...   TM
 * N+2 |    R H   T M
 * N+3 |         H     TM
 * N+4 |           R H   T M
 * N+5 |                H   ...
 * N+6 |              R   ...
 *
 * This is very much analogous to the pipelining of execution in a CPU. And just
 * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
 * branch).
 *
 * When this happens, we throw away our current state, and do the following prep
 * to re-enter the loop:
 *
 * Pos | Time -->
 * ----+-------------------
 * N   | H T
 * N+1 |   H
 *
 * This is also the work we do at the beginning to enter the loop initially.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize,
        U32 const mls, int useCmov)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;

    const BYTE* anchor = istart;
    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;

    U32 rep_offset1 = rep[0];
    U32 rep_offset2 = rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 matchIdx; /* match idx for ip0 */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;

    /* ip0 and ip1 are always adjacent. The targetLength skipping and
     * uncompressibility acceleration is applied to every other position,
     * matching the behavior of #1562. step therefore represents the gap
     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));
    const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;

    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
    ip0 += (ip0 == prefixStart);
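    /* Rep offsets that reach beyond the lowest valid index cannot be used in
     * this block: stash them in offsetSaved1/2 and disable them (set to 0).
     * They are restored, if still unused, at _cleanup below. */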
    {   U32 const curr = (U32)(ip0 - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
        U32 const maxRep = curr - windowLow;
        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    matchIdx = hashTable[hash0];

    do {
        /* load repcode match for ip[2] */
        const U32 rval = MEM_read32(ip2 - rep_offset1);

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        /* check repcode at ip[2] */
        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
            ip0 = ip2;
            match0 = ip0 - rep_offset1;
            mLength = ip0[-1] == match0[-1];
            ip0 -= mLength;
            match0 -= mLength;
            offcode = REPCODE1_TO_OFFBASE;
            mLength += 4;

            /* Write next hash table entry: it's already calculated.
             * This write is known to be safe because ip1 is before the
             * repcode (ip2). */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _match;
        }

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry (it's already calculated).
             * This write is known to be safe because the ip1 == ip0 + 1,
             * so searching will resume after ip1 */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry, since it's already calculated */
            if (step <= 4) {
                /* Avoid writing an index if it's >= position where search will resume.
                 * The minimum possible match has length 4, so search can resume at ip0 + 4.
                 */
                hashTable[hash1] = (U32)(ip1 - base);
            }
            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions one could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
     * When the offsets are still zero, we need to restore them after the block to have a correct
     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
     * offsets were invalid. We need to figure out which offset to refill with.
     *     - If both offsets are zero they are in the same order.
     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
     *     - If only one is zero, we need to decide which offset to restore.
     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
     *         - It is impossible for rep_offset2 to be non-zero.
     *
     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
     */
    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx */

    /* Compute the offset code. */
    match0 = base + matchIdx;
    rep_offset2 = rep_offset1;
    rep_offset1 = (U32)(ip0-match0);
    offcode = OFFSET_TO_OFFBASE(rep_offset1);
    mLength = 4;

    /* Count the backwards match length. */
    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
        ip0--;
        match0--;
        mLength++;
    }

_match: /* Requires: ip0, match0, offcode */

    /* Count the forward length. */
    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        if (rep_offset2 > 0) {  /* rep_offset2==0 means rep_offset2 is invalidated */
            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
                /* store sequence */
                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += rLength;
                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                anchor = ip0;
                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
    }   }   }

    goto _start;
}
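
/* ZSTD_GEN_FAST_FN() generates one thin wrapper per parameter set, named
 * ZSTD_compressBlock_fast_<dictMode>_<mml>_<cmov>, e.g. ZSTD_compressBlock_fast_noDict_4_1.
 * Each wrapper forwards to the corresponding *_generic implementation with constant
 * arguments, which lets the compiler specialize the (force-)inlined body.
 * The third macro parameter maps to `useCmov` for the noDict variant, and to the
 * (currently unused) `hasStep` parameter for the dictMatchState/extDict variants. */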
#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov)                                             \
    static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov(                    \
            ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],           \
            void const* src, size_t srcSize)                                              \
    {                                                                                     \
        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
    }

ZSTD_GEN_FAST_FN(noDict, 4, 1)
ZSTD_GEN_FAST_FN(noDict, 5, 1)
ZSTD_GEN_FAST_FN(noDict, 6, 1)
ZSTD_GEN_FAST_FN(noDict, 7, 1)

ZSTD_GEN_FAST_FN(noDict, 4, 0)
ZSTD_GEN_FAST_FN(noDict, 5, 0)
ZSTD_GEN_FAST_FN(noDict, 6, 0)
ZSTD_GEN_FAST_FN(noDict, 7, 0)

size_t ZSTD_compressBlock_fast(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mml = ms->cParams.minMatch;
    /* use cmov when "candidate in range" branch is likely unpredictable */
    int const useCmov = ms->cParams.windowLog < 19;
    assert(ms->dictMatchState == NULL);
    if (useCmov) {
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
        }
    } else {
        /* use a branch instead */
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
        }
    }
}
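
/* dictMatchState mode: a dictionary ZSTD_MatchState_t (dms) is attached below the
 * current prefix. Indices below prefixStartIndex refer to dictionary content and are
 * translated into dms->window coordinates via dictIndexDelta; match lengths against
 * dictionary positions are counted with ZSTD_count_2segments(), so a match starting
 * in the dictionary may continue into the current prefix. */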
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip0 = istart;
    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
    const BYTE* anchor = istart;
    const U32   prefixStartIndex = ms->window.dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
    U32 offset_1=rep[0], offset_2=rep[1];

    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
    const U32* const dictHashTable = dms->hashTable;
    const U32 dictStartIndex       = dms->window.dictLimit;
    const BYTE* const dictBase     = dms->window.base;
    const BYTE* const dictStart    = dictBase + dictStartIndex;
    const BYTE* const dictEnd      = dms->window.nextSrc;
    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;

    /* if a dictionary is still attached, it necessarily means that
     * it is within window size. So we just check it. */
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
    assert(endIndex - prefixStartIndex <= maxDistance);
    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    /* ensure there will be no underflow
     * when translating a dict index into a local index */
    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));

    if (ms->prefetchCDictTables) {
        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
        PREFETCH_AREA(dictHashTable, hashTableBytes);
    }

    /* init */
    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
    ip0 += (dictAndPrefixLength == 0);
    /* dictMatchState repCode checks don't currently handle repCode == 0
     * disabling. */
    assert(offset_1 <= dictAndPrefixLength);
    assert(offset_2 <= dictAndPrefixLength);

    /* Outer search loop */
    assert(stepSize >= 1);
    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
        size_t mLength;
        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);

        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);

        U32 matchIndex = hashTable[hash0];
        U32 curr = (U32)(ip0 - base);
        size_t step = stepSize;
        const size_t kStepIncr = 1 << kSearchStrength;
        const BYTE* nextStep = ip0 + kStepIncr;

        /* Inner search loop */
        while (1) {
            const BYTE* match = base + matchIndex;
            const U32 repIndex = curr + 1 - offset_1;
            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
            hashTable[hash0] = curr;   /* update hash table */

            if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
                ip0++;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                break;
            }

            if (dictTagsMatch) {
                /* Found a possible dict match */
                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
                const BYTE* dictMatch = dictBase + dictMatchIndex;
                if (dictMatchIndex > dictStartIndex &&
                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
                    if (matchIndex <= prefixStartIndex) {
                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
                        while (((ip0 > anchor) & (dictMatch > dictStart))
                             && (ip0[-1] == dictMatch[-1])) {
                            ip0--;
                            dictMatch--;
                            mLength++;
                        } /* catch up */
                        offset_2 = offset_1;
                        offset_1 = offset;
                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                        break;
                    }
                }
            }

            if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
                /* found a regular match of size >= 4 */
                U32 const offset = (U32) (ip0 - match);
                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
                while (((ip0 > anchor) & (match > prefixStart))
                     && (ip0[-1] == match[-1])) {
                    ip0--;
                    match--;
                    mLength++;
                } /* catch up */
                offset_2 = offset_1;
                offset_1 = offset;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                break;
            }

            /* Prepare for next iteration */
            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
            matchIndex = hashTable[hash1];

            if (ip1 >= nextStep) {
                step++;
                nextStep += kStepIncr;
            }
            ip0 = ip1;
            ip1 = ip1 + step;
            if (ip1 > ilimit) goto _cleanup;

            curr = (U32)(ip0 - base);
            hash0 = hash1;
        }   /* end inner search loop */

        /* match found */
        assert(mLength);
        ip0 += mLength;
        anchor = ip0;

        if (ip0 <= ilimit) {
            /* Fill Table */
            assert(base+curr+2 > istart);  /* check base overflow */
            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

            /* check immediate repcode */
            while (ip0 <= ilimit) {
                U32 const current2 = (U32)(ip0-base);
                U32 const repIndex2 = current2 - offset_2;
                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                        dictBase - dictIndexDelta + repIndex2 :
                        base + repIndex2;
                if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
                    ip0 += repLength2;
                    anchor = ip0;
                    continue;
                }
                break;
            }
        }

        /* Prepare for next iteration */
        assert(ip0 == anchor);
        ip1 = ip0 + stepSize;
    }

_cleanup:
    /* save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}


ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)

size_t ZSTD_compressBlock_fast_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState != NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
    }
}
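
/* extDict mode: previously-seen content is addressed through ms->window.dictBase
 * for indices in [dictStartIndex, prefixStartIndex), while current content is
 * addressed through ms->window.base. Candidate and repcode matches may start in
 * either segment; lengths are counted with ZSTD_count_2segments(), so a match
 * beginning in the extDict segment can continue into the current prefix. */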
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_extDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* anchor = istart;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
    const U32   dictStartIndex = lowLimit;
    const BYTE* const dictStart = dictBase + dictStartIndex;
    const U32   dictLimit = ms->window.dictLimit;
    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const dictEnd = dictBase + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;


    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 idx; /* match idx for ip0 */
    const BYTE* idxBase; /* base pointer for idx */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;
    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */

    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);

    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
    if (prefixStartIndex == dictStartIndex)
        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);

    {   U32 const curr = (U32)(ip0 - base);
        U32 const maxRep = curr - dictStartIndex;
        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    idx = hashTable[hash0];
    idxBase = idx < prefixStartIndex ? dictBase : base;

    do {
        {   /* load repcode match for ip[2] */
            U32 const current2 = (U32)(ip2 - base);
            U32 const repIndex = current2 - offset_1;
            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
            U32 rval;
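            /* Single unsigned test, relying on intentional underflow:
             * - if repIndex > prefixStartIndex, the subtraction wraps to a large value,
             *   so the test passes and the repcode match is read from the current prefix;
             * - if repIndex < prefixStartIndex, the match is read from the extDict segment,
             *   which requires at least 4 readable bytes below prefixStartIndex.
             * Borderline positions that would straddle the segment boundary are rejected. */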
            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
               & (offset_1 > 0) ) {
                rval = MEM_read32(repBase + repIndex);
            } else {
                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
            }

            /* write back hash table entry */
            current0 = (U32)(ip0 - base);
            hashTable[hash0] = current0;

            /* check repcode at ip[2] */
            if (MEM_read32(ip2) == rval) {
                ip0 = ip2;
                match0 = repBase + repIndex;
                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                assert((match0 != prefixStart) & (match0 != dictStart));
                mLength = ip0[-1] == match0[-1];
                ip0 -= mLength;
                match0 -= mLength;
                offcode = REPCODE1_TO_OFFBASE;
                mLength += 4;
                goto _match;
        }   }

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions we could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx, idxBase */

    /* Compute the offset code. */
    {   U32 const offset = current0 - idx;
        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
        match0 = idxBase + idx;
        offset_2 = offset_1;
        offset_1 = offset;
        offcode = OFFSET_TO_OFFBASE(offset);
        mLength = 4;

        /* Count the backwards match length. */
        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
            ip0--;
            match0--;
            mLength++;
    }   }

_match: /* Requires: ip0, match0, offcode, matchEnd */

    /* Count the forward length. */
    assert(matchEnd != 0);
    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* write next hash table entry */
    if (ip1 < ip0) {
        hashTable[hash1] = (U32)(ip1 - base);
    }

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        while (ip0 <= ilimit) {
            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
            if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
               && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += repLength2;
                anchor = ip0;
                continue;
            }
            break;
    }   }

    goto _start;
}

ZSTD_GEN_FAST_FN(extDict, 4, 0)
ZSTD_GEN_FAST_FN(extDict, 5, 0)
ZSTD_GEN_FAST_FN(extDict, 6, 0)
ZSTD_GEN_FAST_FN(extDict, 7, 0)

size_t ZSTD_compressBlock_fast_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState == NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
    }
}