/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms12023-04-02 : Igor Pavlov : Public domain */23#include "Precomp.h"45#include "CpuArch.h"6#include "LzFind.h"78// #include "LzFindMt.h"910// #define LOG_ITERS1112// #define LOG_THREAD1314#ifdef LOG_THREAD15#include <stdio.h>16#define PRF(x) x17#else18// #define PRF(x)19#endif2021#ifdef LOG_ITERS22#include <stdio.h>23UInt64 g_NumIters_Tree;24UInt64 g_NumIters_Loop;25UInt64 g_NumIters_Bytes;26#define LOG_ITER(x) x27#else28#define LOG_ITER(x)29#endif3031// ---------- BT THREAD ----------3233#define USE_SON_PREFETCH34#define USE_LONG_MATCH_OPT3536#define kEmptyHashValue 03738// #define CYC_TO_POS_OFFSET 03940// #define CYC_TO_POS_OFFSET 1 // for debug4142/*43Z7_NO_INLINE44UInt32 * Z7_FASTCALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,45UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes)46{47do48{49UInt32 delta;50if (hash == size)51break;52delta = *hash++;5354if (delta == 0 || delta > (UInt32)pos)55return NULL;5657lenLimit++;5859if (delta == (UInt32)pos)60{61CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2;62*d++ = 0;63ptr1[0] = kEmptyHashValue;64ptr1[1] = kEmptyHashValue;65}66else67{68UInt32 *_distances = ++d;6970CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;71CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;7273const Byte *len0 = cur, *len1 = cur;74UInt32 cutValue = _cutValue;75const Byte *maxLen = cur + _maxLen;7677for (LOG_ITER(g_NumIters_Tree++);;)78{79LOG_ITER(g_NumIters_Loop++);80{81const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;82CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1);83const Byte *len = (len0 < len1 ? len0 : len1);8485#ifdef USE_SON_PREFETCH86const UInt32 pair0 = *pair;87#endif8889if (len[diff] == len[0])90{91if (++len != lenLimit && len[diff] == len[0])92while (++len != lenLimit)93{94LOG_ITER(g_NumIters_Bytes++);95if (len[diff] != len[0])96break;97}98if (maxLen < len)99{100maxLen = len;101*d++ = (UInt32)(len - cur);102*d++ = delta - 1;103104if (len == lenLimit)105{106const UInt32 pair1 = pair[1];107*ptr1 =108#ifdef USE_SON_PREFETCH109pair0;110#else111pair[0];112#endif113*ptr0 = pair1;114115_distances[-1] = (UInt32)(d - _distances);116117#ifdef USE_LONG_MATCH_OPT118119if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)120break;121122{123for (;;)124{125hash++;126pos++;127cur++;128lenLimit++;129{130CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;131#if 0132*(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];133#else134const UInt32 p0 = ptr[0 + (diff * 2)];135const UInt32 p1 = ptr[1 + (diff * 2)];136ptr[0] = p0;137ptr[1] = p1;138// ptr[0] = ptr[0 + (diff * 2)];139// ptr[1] = ptr[1 + (diff * 2)];140#endif141}142// PrintSon(son + 2, pos - 1);143// printf("\npos = %x delta = %x\n", pos, delta);144len++;145*d++ = 2;146*d++ = (UInt32)(len - cur);147*d++ = delta - 1;148if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)149break;150}151}152#endif153154break;155}156}157}158159{160const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff);161if (len[diff] < len[0])162{163delta = pair[1];164if (delta >= curMatch)165return NULL;166*ptr1 = curMatch;167ptr1 = pair + 1;168len1 = len;169}170else171{172delta = *pair;173if (delta >= curMatch)174return NULL;175*ptr0 = curMatch;176ptr0 = pair;177len0 = len;178}179180delta = (UInt32)pos - delta;181182if (--cutValue == 0 || delta >= pos)183{184*ptr0 = *ptr1 = kEmptyHashValue;185_distances[-1] = (UInt32)(d - _distances);186break;187}188}189}190} // for (tree iterations)191}192pos++;193cur++;194}195while (d < limit);196*posRes = (UInt32)pos;197return d;198}199*/200201/* define cbs if you use 2 functions.202GetMatchesSpecN_1() : (pos < _cyclicBufferSize)203GetMatchesSpecN_2() : (pos >= _cyclicBufferSize)204205do not define cbs if you use 1 function:206GetMatchesSpecN_2()207*/208209// #define cbs _cyclicBufferSize210211/*212we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32213to eliminate "movsx" BUG in old MSVC x64 compiler.214*/215216UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,217UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,218size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,219UInt32 *posRes);220221Z7_NO_INLINE222UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,223UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,224size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,225UInt32 *posRes)226{227do // while (hash != size)228{229UInt32 delta;230231#ifndef cbs232UInt32 cbs;233#endif234235if (hash == size)236break;237238delta = *hash++;239240if (delta == 0)241return NULL;242243lenLimit++;244245#ifndef cbs246cbs = _cyclicBufferSize;247if ((UInt32)pos < cbs)248{249if (delta > (UInt32)pos)250return NULL;251cbs = (UInt32)pos;252}253#endif254255if (delta >= cbs)256{257CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);258*d++ = 0;259ptr1[0] = kEmptyHashValue;260ptr1[1] = kEmptyHashValue;261}262else263{264UInt32 *_distances = ++d;265266CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;267CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);268269UInt32 cutValue = _cutValue;270const Byte *len0 = cur, *len1 = cur;271const Byte *maxLen = cur + _maxLen;272273// if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else274for (LOG_ITER(g_NumIters_Tree++);;)275{276LOG_ITER(g_NumIters_Loop++);277{278// SPEC code279CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta280+ (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)281) << 1);282283const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;284const Byte *len = (len0 < len1 ? len0 : len1);285286#ifdef USE_SON_PREFETCH287const UInt32 pair0 = *pair;288#endif289290if (len[diff] == len[0])291{292if (++len != lenLimit && len[diff] == len[0])293while (++len != lenLimit)294{295LOG_ITER(g_NumIters_Bytes++);296if (len[diff] != len[0])297break;298}299if (maxLen < len)300{301maxLen = len;302*d++ = (UInt32)(len - cur);303*d++ = delta - 1;304305if (len == lenLimit)306{307const UInt32 pair1 = pair[1];308*ptr1 =309#ifdef USE_SON_PREFETCH310pair0;311#else312pair[0];313#endif314*ptr0 = pair1;315316_distances[-1] = (UInt32)(d - _distances);317318#ifdef USE_LONG_MATCH_OPT319320if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)321break;322323{324for (;;)325{326*d++ = 2;327*d++ = (UInt32)(lenLimit - cur);328*d++ = delta - 1;329cur++;330lenLimit++;331// SPEC332_cyclicBufferPos++;333{334// SPEC code335CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1);336const CLzRef *src = dest + ((diff337+ (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? cbs : 0)) << 1);338// CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;339#if 0340*(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src);341#else342const UInt32 p0 = src[0];343const UInt32 p1 = src[1];344dest[0] = p0;345dest[1] = p1;346#endif347}348pos++;349hash++;350if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)351break;352} // for() end for long matches353}354#endif355356break; // break from TREE iterations357}358}359}360{361const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff);362if (len[diff] < len[0])363{364delta = pair[1];365*ptr1 = curMatch;366ptr1 = pair + 1;367len1 = len;368if (delta >= curMatch)369return NULL;370}371else372{373delta = *pair;374*ptr0 = curMatch;375ptr0 = pair;376len0 = len;377if (delta >= curMatch)378return NULL;379}380delta = (UInt32)pos - delta;381382if (--cutValue == 0 || delta >= cbs)383{384*ptr0 = *ptr1 = kEmptyHashValue;385_distances[-1] = (UInt32)(d - _distances);386break;387}388}389}390} // for (tree iterations)391}392pos++;393_cyclicBufferPos++;394cur++;395}396while (d < limit);397*posRes = (UInt32)pos;398return d;399}400401402403/*404typedef UInt32 uint32plus; // size_t405406UInt32 * Z7_FASTCALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son,407UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,408size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,409UInt32 *posRes)410{411do // while (hash != size)412{413UInt32 delta;414415#ifndef cbs416UInt32 cbs;417#endif418419if (hash == size)420break;421422delta = *hash++;423424if (delta == 0)425return NULL;426427#ifndef cbs428cbs = _cyclicBufferSize;429if ((UInt32)pos < cbs)430{431if (delta > (UInt32)pos)432return NULL;433cbs = (UInt32)pos;434}435#endif436437if (delta >= cbs)438{439CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);440*d++ = 0;441ptr1[0] = kEmptyHashValue;442ptr1[1] = kEmptyHashValue;443}444else445{446CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;447CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);448UInt32 *_distances = ++d;449uint32plus len0 = 0, len1 = 0;450UInt32 cutValue = _cutValue;451uint32plus maxLen = _maxLen;452// lenLimit++; // const Byte *lenLimit = cur + _lenLimit;453454for (LOG_ITER(g_NumIters_Tree++);;)455{456LOG_ITER(g_NumIters_Loop++);457{458// const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;459CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta460+ (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)461) << 1);462const Byte *pb = cur - delta;463uint32plus len = (len0 < len1 ? len0 : len1);464465#ifdef USE_SON_PREFETCH466const UInt32 pair0 = *pair;467#endif468469if (pb[len] == cur[len])470{471if (++len != lenLimit && pb[len] == cur[len])472while (++len != lenLimit)473if (pb[len] != cur[len])474break;475if (maxLen < len)476{477maxLen = len;478*d++ = (UInt32)len;479*d++ = delta - 1;480if (len == lenLimit)481{482{483const UInt32 pair1 = pair[1];484*ptr0 = pair1;485*ptr1 =486#ifdef USE_SON_PREFETCH487pair0;488#else489pair[0];490#endif491}492493_distances[-1] = (UInt32)(d - _distances);494495#ifdef USE_LONG_MATCH_OPT496497if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit)498break;499500{501const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;502for (;;)503{504*d++ = 2;505*d++ = (UInt32)lenLimit;506*d++ = delta - 1;507_cyclicBufferPos++;508{509CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1);510const CLzRef *src = dest + ((diff +511(ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1);512#if 0513*(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src);514#else515const UInt32 p0 = src[0];516const UInt32 p1 = src[1];517dest[0] = p0;518dest[1] = p1;519#endif520}521hash++;522pos++;523cur++;524pb++;525if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit)526break;527}528}529#endif530531break;532}533}534}535{536const UInt32 curMatch = (UInt32)pos - delta;537if (pb[len] < cur[len])538{539delta = pair[1];540*ptr1 = curMatch;541ptr1 = pair + 1;542len1 = len;543}544else545{546delta = *pair;547*ptr0 = curMatch;548ptr0 = pair;549len0 = len;550}551552{553if (delta >= curMatch)554return NULL;555delta = (UInt32)pos - delta;556if (delta >= cbs557// delta >= _cyclicBufferSize || delta >= pos558|| --cutValue == 0)559{560*ptr0 = *ptr1 = kEmptyHashValue;561_distances[-1] = (UInt32)(d - _distances);562break;563}564}565}566}567} // for (tree iterations)568}569pos++;570_cyclicBufferPos++;571cur++;572}573while (d < limit);574*posRes = (UInt32)pos;575return d;576}577*/578579580