Path: blob/main/sys/contrib/zstd/lib/dictBuilder/zdict.c
48378 views
/*1* Copyright (c) Yann Collet, Facebook, Inc.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/91011/*-**************************************12* Tuning parameters13****************************************/14#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */15#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)16#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)171819/*-**************************************20* Compiler Options21****************************************/22/* Unix Large Files support (>4GB) */23#define _FILE_OFFSET_BITS 6424#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */25# ifndef _LARGEFILE_SOURCE26# define _LARGEFILE_SOURCE27# endif28#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */29# ifndef _LARGEFILE64_SOURCE30# define _LARGEFILE64_SOURCE31# endif32#endif333435/*-*************************************36* Dependencies37***************************************/38#include <stdlib.h> /* malloc, free */39#include <string.h> /* memset */40#include <stdio.h> /* fprintf, fopen, ftello64 */41#include <time.h> /* clock */4243#ifndef ZDICT_STATIC_LINKING_ONLY44# define ZDICT_STATIC_LINKING_ONLY45#endif46#define HUF_STATIC_LINKING_ONLY4748#include "../common/mem.h" /* read */49#include "../common/fse.h" /* FSE_normalizeCount, FSE_writeNCount */50#include "../common/huf.h" /* HUF_buildCTable, HUF_writeCTable */51#include "../common/zstd_internal.h" /* includes zstd.h */52#include "../common/xxhash.h" /* XXH64 */53#include "../compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */54#include "../zdict.h"55#include "divsufsort.h"565758/*-*************************************59* Constants60***************************************/61#define KB *(1 <<10)62#define MB *(1 <<20)63#define GB *(1U<<30)6465#define DICTLISTSIZE_DEFAULT 100006667#define NOISELENGTH 326869static const U32 g_selectivity_default = 9;707172/*-*************************************73* Console display74***************************************/75#undef DISPLAY76#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }77#undef DISPLAYLEVEL78#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */7980static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }8182static void ZDICT_printHex(const void* ptr, size_t length)83{84const BYTE* const b = (const BYTE*)ptr;85size_t u;86for (u=0; u<length; u++) {87BYTE c = b[u];88if (c<32 || c>126) c = '.'; /* non-printable char */89DISPLAY("%c", c);90}91}929394/*-********************************************************95* Helper functions96**********************************************************/97unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }9899const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }100101unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)102{103if (dictSize < 8) return 0;104if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;105return MEM_readLE32((const char*)dictBuffer + 4);106}107108size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)109{110size_t headerSize;111if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);112113{ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));114U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);115if (!bs || !wksp) {116headerSize = ERROR(memory_allocation);117} else {118ZSTD_reset_compressedBlockState(bs);119headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);120}121122free(bs);123free(wksp);124}125126return headerSize;127}128129/*-********************************************************130* Dictionary training functions131**********************************************************/132static unsigned ZDICT_NbCommonBytes (size_t val)133{134if (MEM_isLittleEndian()) {135if (MEM_64bits()) {136# if defined(_MSC_VER) && defined(_WIN64)137if (val != 0) {138unsigned long r;139_BitScanForward64(&r, (U64)val);140return (unsigned)(r >> 3);141} else {142/* Should not reach this code path */143__assume(0);144}145# elif defined(__GNUC__) && (__GNUC__ >= 3)146return (unsigned)(__builtin_ctzll((U64)val) >> 3);147# else148static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };149return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];150# endif151} else { /* 32 bits */152# if defined(_MSC_VER)153if (val != 0) {154unsigned long r;155_BitScanForward(&r, (U32)val);156return (unsigned)(r >> 3);157} else {158/* Should not reach this code path */159__assume(0);160}161# elif defined(__GNUC__) && (__GNUC__ >= 3)162return (unsigned)(__builtin_ctz((U32)val) >> 3);163# else164static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };165return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];166# endif167}168} else { /* Big Endian CPU */169if (MEM_64bits()) {170# if defined(_MSC_VER) && defined(_WIN64)171if (val != 0) {172unsigned long r;173_BitScanReverse64(&r, val);174return (unsigned)(r >> 3);175} else {176/* Should not reach this code path */177__assume(0);178}179# elif defined(__GNUC__) && (__GNUC__ >= 3)180return (unsigned)(__builtin_clzll(val) >> 3);181# else182unsigned r;183const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */184if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }185if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }186r += (!val);187return r;188# endif189} else { /* 32 bits */190# if defined(_MSC_VER)191if (val != 0) {192unsigned long r;193_BitScanReverse(&r, (unsigned long)val);194return (unsigned)(r >> 3);195} else {196/* Should not reach this code path */197__assume(0);198}199# elif defined(__GNUC__) && (__GNUC__ >= 3)200return (unsigned)(__builtin_clz((U32)val) >> 3);201# else202unsigned r;203if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }204r += (!val);205return r;206# endif207} }208}209210211/*! ZDICT_count() :212Count the nb of common bytes between 2 pointers.213Note : this function presumes end of buffer followed by noisy guard band.214*/215static size_t ZDICT_count(const void* pIn, const void* pMatch)216{217const char* const pStart = (const char*)pIn;218for (;;) {219size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);220if (!diff) {221pIn = (const char*)pIn+sizeof(size_t);222pMatch = (const char*)pMatch+sizeof(size_t);223continue;224}225pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);226return (size_t)((const char*)pIn - pStart);227}228}229230231typedef struct {232U32 pos;233U32 length;234U32 savings;235} dictItem;236237static void ZDICT_initDictItem(dictItem* d)238{239d->pos = 1;240d->length = 0;241d->savings = (U32)(-1);242}243244245#define LLIMIT 64 /* heuristic determined experimentally */246#define MINMATCHLENGTH 7 /* heuristic determined experimentally */247static dictItem ZDICT_analyzePos(248BYTE* doneMarks,249const int* suffix, U32 start,250const void* buffer, U32 minRatio, U32 notificationLevel)251{252U32 lengthList[LLIMIT] = {0};253U32 cumulLength[LLIMIT] = {0};254U32 savings[LLIMIT] = {0};255const BYTE* b = (const BYTE*)buffer;256size_t maxLength = LLIMIT;257size_t pos = (size_t)suffix[start];258U32 end = start;259dictItem solution;260261/* init */262memset(&solution, 0, sizeof(solution));263doneMarks[pos] = 1;264265/* trivial repetition cases */266if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))267||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))268||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {269/* skip and mark segment */270U16 const pattern16 = MEM_read16(b+pos+4);271U32 u, patternEnd = 6;272while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;273if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;274for (u=1; u<patternEnd; u++)275doneMarks[pos+u] = 1;276return solution;277}278279/* look forward */280{ size_t length;281do {282end++;283length = ZDICT_count(b + pos, b + suffix[end]);284} while (length >= MINMATCHLENGTH);285}286287/* look backward */288{ size_t length;289do {290length = ZDICT_count(b + pos, b + *(suffix+start-1));291if (length >=MINMATCHLENGTH) start--;292} while(length >= MINMATCHLENGTH);293}294295/* exit if not found a minimum nb of repetitions */296if (end-start < minRatio) {297U32 idx;298for(idx=start; idx<end; idx++)299doneMarks[suffix[idx]] = 1;300return solution;301}302303{ int i;304U32 mml;305U32 refinedStart = start;306U32 refinedEnd = end;307308DISPLAYLEVEL(4, "\n");309DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end-start), MINMATCHLENGTH, (unsigned)pos);310DISPLAYLEVEL(4, "\n");311312for (mml = MINMATCHLENGTH ; ; mml++) {313BYTE currentChar = 0;314U32 currentCount = 0;315U32 currentID = refinedStart;316U32 id;317U32 selectedCount = 0;318U32 selectedID = currentID;319for (id =refinedStart; id < refinedEnd; id++) {320if (b[suffix[id] + mml] != currentChar) {321if (currentCount > selectedCount) {322selectedCount = currentCount;323selectedID = currentID;324}325currentID = id;326currentChar = b[ suffix[id] + mml];327currentCount = 0;328}329currentCount ++;330}331if (currentCount > selectedCount) { /* for last */332selectedCount = currentCount;333selectedID = currentID;334}335336if (selectedCount < minRatio)337break;338refinedStart = selectedID;339refinedEnd = refinedStart + selectedCount;340}341342/* evaluate gain based on new dict */343start = refinedStart;344pos = suffix[refinedStart];345end = start;346memset(lengthList, 0, sizeof(lengthList));347348/* look forward */349{ size_t length;350do {351end++;352length = ZDICT_count(b + pos, b + suffix[end]);353if (length >= LLIMIT) length = LLIMIT-1;354lengthList[length]++;355} while (length >=MINMATCHLENGTH);356}357358/* look backward */359{ size_t length = MINMATCHLENGTH;360while ((length >= MINMATCHLENGTH) & (start > 0)) {361length = ZDICT_count(b + pos, b + suffix[start - 1]);362if (length >= LLIMIT) length = LLIMIT - 1;363lengthList[length]++;364if (length >= MINMATCHLENGTH) start--;365}366}367368/* largest useful length */369memset(cumulLength, 0, sizeof(cumulLength));370cumulLength[maxLength-1] = lengthList[maxLength-1];371for (i=(int)(maxLength-2); i>=0; i--)372cumulLength[i] = cumulLength[i+1] + lengthList[i];373374for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;375maxLength = i;376377/* reduce maxLength in case of final into repetitive data */378{ U32 l = (U32)maxLength;379BYTE const c = b[pos + maxLength-1];380while (b[pos+l-2]==c) l--;381maxLength = l;382}383if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */384385/* calculate savings */386savings[5] = 0;387for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)388savings[i] = savings[i-1] + (lengthList[i] * (i-3));389390DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",391(unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength);392393solution.pos = (U32)pos;394solution.length = (U32)maxLength;395solution.savings = savings[maxLength];396397/* mark positions done */398{ U32 id;399for (id=start; id<end; id++) {400U32 p, pEnd, length;401U32 const testedPos = (U32)suffix[id];402if (testedPos == pos)403length = solution.length;404else {405length = (U32)ZDICT_count(b+pos, b+testedPos);406if (length > solution.length) length = solution.length;407}408pEnd = (U32)(testedPos + length);409for (p=testedPos; p<pEnd; p++)410doneMarks[p] = 1;411} } }412413return solution;414}415416417static int isIncluded(const void* in, const void* container, size_t length)418{419const char* const ip = (const char*) in;420const char* const into = (const char*) container;421size_t u;422423for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */424if (ip[u] != into[u]) break;425}426427return u==length;428}429430/*! ZDICT_tryMerge() :431check if dictItem can be merged, do it if possible432@return : id of destination elt, 0 if not merged433*/434static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)435{436const U32 tableSize = table->pos;437const U32 eltEnd = elt.pos + elt.length;438const char* const buf = (const char*) buffer;439440/* tail overlap */441U32 u; for (u=1; u<tableSize; u++) {442if (u==eltNbToSkip) continue;443if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */444/* append */445U32 const addedLength = table[u].pos - elt.pos;446table[u].length += addedLength;447table[u].pos = elt.pos;448table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */449table[u].savings += elt.length / 8; /* rough approx bonus */450elt = table[u];451/* sort : improve rank */452while ((u>1) && (table[u-1].savings < elt.savings))453table[u] = table[u-1], u--;454table[u] = elt;455return u;456} }457458/* front overlap */459for (u=1; u<tableSize; u++) {460if (u==eltNbToSkip) continue;461462if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */463/* append */464int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length);465table[u].savings += elt.length / 8; /* rough approx bonus */466if (addedLength > 0) { /* otherwise, elt fully included into existing */467table[u].length += addedLength;468table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */469}470/* sort : improve rank */471elt = table[u];472while ((u>1) && (table[u-1].savings < elt.savings))473table[u] = table[u-1], u--;474table[u] = elt;475return u;476}477478if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {479if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {480size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );481table[u].pos = elt.pos;482table[u].savings += (U32)(elt.savings * addedLength / elt.length);483table[u].length = MIN(elt.length, table[u].length + 1);484return u;485}486}487}488489return 0;490}491492493static void ZDICT_removeDictItem(dictItem* table, U32 id)494{495/* convention : table[0].pos stores nb of elts */496U32 const max = table[0].pos;497U32 u;498if (!id) return; /* protection, should never happen */499for (u=id; u<max-1; u++)500table[u] = table[u+1];501table->pos--;502}503504505static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)506{507/* merge if possible */508U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);509if (mergeId) {510U32 newMerge = 1;511while (newMerge) {512newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);513if (newMerge) ZDICT_removeDictItem(table, mergeId);514mergeId = newMerge;515}516return;517}518519/* insert */520{ U32 current;521U32 nextElt = table->pos;522if (nextElt >= maxSize) nextElt = maxSize-1;523current = nextElt-1;524while (table[current].savings < elt.savings) {525table[current+1] = table[current];526current--;527}528table[current+1] = elt;529table->pos = nextElt+1;530}531}532533534static U32 ZDICT_dictSize(const dictItem* dictList)535{536U32 u, dictSize = 0;537for (u=1; u<dictList[0].pos; u++)538dictSize += dictList[u].length;539return dictSize;540}541542543static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,544const void* const buffer, size_t bufferSize, /* buffer must end with noisy guard band */545const size_t* fileSizes, unsigned nbFiles,546unsigned minRatio, U32 notificationLevel)547{548int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));549int* const suffix = suffix0+1;550U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));551BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */552U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));553size_t result = 0;554clock_t displayClock = 0;555clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;556557# undef DISPLAYUPDATE558# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \559if (ZDICT_clockSpan(displayClock) > refreshRate) \560{ displayClock = clock(); DISPLAY(__VA_ARGS__); \561if (notificationLevel>=4) fflush(stderr); } }562563/* init */564DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */565if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {566result = ERROR(memory_allocation);567goto _cleanup;568}569if (minRatio < MINRATIO) minRatio = MINRATIO;570memset(doneMarks, 0, bufferSize+16);571572/* limit sample set size (divsufsort limitation)*/573if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE>>20));574while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];575576/* sort */577DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (unsigned)(bufferSize>>20));578{ int const divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);579if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }580}581suffix[bufferSize] = (int)bufferSize; /* leads into noise */582suffix0[0] = (int)bufferSize; /* leads into noise */583/* build reverse suffix sort */584{ size_t pos;585for (pos=0; pos < bufferSize; pos++)586reverseSuffix[suffix[pos]] = (U32)pos;587/* note filePos tracks borders between samples.588It's not used at this stage, but planned to become useful in a later update */589filePos[0] = 0;590for (pos=1; pos<nbFiles; pos++)591filePos[pos] = (U32)(filePos[pos-1] + fileSizes[pos-1]);592}593594DISPLAYLEVEL(2, "finding patterns ... \n");595DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio);596597{ U32 cursor; for (cursor=0; cursor < bufferSize; ) {598dictItem solution;599if (doneMarks[cursor]) { cursor++; continue; }600solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);601if (solution.length==0) { cursor++; continue; }602ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);603cursor += solution.length;604DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);605} }606607_cleanup:608free(suffix0);609free(reverseSuffix);610free(doneMarks);611free(filePos);612return result;613}614615616static void ZDICT_fillNoise(void* buffer, size_t length)617{618unsigned const prime1 = 2654435761U;619unsigned const prime2 = 2246822519U;620unsigned acc = prime1;621size_t p=0;622for (p=0; p<length; p++) {623acc *= prime2;624((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);625}626}627628629typedef struct630{631ZSTD_CDict* dict; /* dictionary */632ZSTD_CCtx* zc; /* working context */633void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */634} EStats_ress_t;635636#define MAXREPOFFSET 1024637638static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,639unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,640const void* src, size_t srcSize,641U32 notificationLevel)642{643size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);644size_t cSize;645646if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */647{ size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict);648if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }649650}651cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);652if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; }653654if (cSize) { /* if == 0; block is not compressible */655const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);656657/* literals stats */658{ const BYTE* bytePtr;659for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)660countLit[*bytePtr]++;661}662663/* seqStats */664{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);665ZSTD_seqToCodes(seqStorePtr);666667{ const BYTE* codePtr = seqStorePtr->ofCode;668U32 u;669for (u=0; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;670}671672{ const BYTE* codePtr = seqStorePtr->mlCode;673U32 u;674for (u=0; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;675}676677{ const BYTE* codePtr = seqStorePtr->llCode;678U32 u;679for (u=0; u<nbSeq; u++) litlengthCount[codePtr[u]]++;680}681682if (nbSeq >= 2) { /* rep offsets */683const seqDef* const seq = seqStorePtr->sequencesStart;684U32 offset1 = seq[0].offBase - ZSTD_REP_NUM;685U32 offset2 = seq[1].offBase - ZSTD_REP_NUM;686if (offset1 >= MAXREPOFFSET) offset1 = 0;687if (offset2 >= MAXREPOFFSET) offset2 = 0;688repOffsets[offset1] += 3;689repOffsets[offset2] += 1;690} } }691}692693static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)694{695size_t total=0;696unsigned u;697for (u=0; u<nbFiles; u++) total += fileSizes[u];698return total;699}700701typedef struct { U32 offset; U32 count; } offsetCount_t;702703static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, U32 count)704{705U32 u;706table[ZSTD_REP_NUM].offset = val;707table[ZSTD_REP_NUM].count = count;708for (u=ZSTD_REP_NUM; u>0; u--) {709offsetCount_t tmp;710if (table[u-1].count >= table[u].count) break;711tmp = table[u-1];712table[u-1] = table[u];713table[u] = tmp;714}715}716717/* ZDICT_flatLit() :718* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.719* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.720*/721static void ZDICT_flatLit(unsigned* countLit)722{723int u;724for (u=1; u<256; u++) countLit[u] = 2;725countLit[0] = 4;726countLit[253] = 1;727countLit[254] = 1;728}729730#define OFFCODE_MAX 30 /* only applicable to first block */731static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,732int compressionLevel,733const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,734const void* dictBuffer, size_t dictBufferSize,735unsigned notificationLevel)736{737unsigned countLit[256];738HUF_CREATE_STATIC_CTABLE(hufTable, 255);739unsigned offcodeCount[OFFCODE_MAX+1];740short offcodeNCount[OFFCODE_MAX+1];741U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + 128 KB));742unsigned matchLengthCount[MaxML+1];743short matchLengthNCount[MaxML+1];744unsigned litLengthCount[MaxLL+1];745short litLengthNCount[MaxLL+1];746U32 repOffset[MAXREPOFFSET];747offsetCount_t bestRepOffset[ZSTD_REP_NUM+1];748EStats_ress_t esr = { NULL, NULL, NULL };749ZSTD_parameters params;750U32 u, huffLog = 11, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;751size_t pos = 0, errorCode;752size_t eSize = 0;753size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);754size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);755BYTE* dstPtr = (BYTE*)dstBuffer;756757/* init */758DEBUGLOG(4, "ZDICT_analyzeEntropy");759if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */760for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */761for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;762for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;763for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;764memset(repOffset, 0, sizeof(repOffset));765repOffset[1] = repOffset[4] = repOffset[8] = 1;766memset(bestRepOffset, 0, sizeof(bestRepOffset));767if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT;768params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);769770esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem);771esr.zc = ZSTD_createCCtx();772esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);773if (!esr.dict || !esr.zc || !esr.workPlace) {774eSize = ERROR(memory_allocation);775DISPLAYLEVEL(1, "Not enough memory \n");776goto _cleanup;777}778779/* collect stats on all samples */780for (u=0; u<nbFiles; u++) {781ZDICT_countEStats(esr, ¶ms,782countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,783(const char*)srcBuffer + pos, fileSizes[u],784notificationLevel);785pos += fileSizes[u];786}787788if (notificationLevel >= 4) {789/* writeStats */790DISPLAYLEVEL(4, "Offset Code Frequencies : \n");791for (u=0; u<=offcodeMax; u++) {792DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]);793} }794795/* analyze, build stats, starting with literals */796{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);797if (HUF_isError(maxNbBits)) {798eSize = maxNbBits;799DISPLAYLEVEL(1, " HUF_buildCTable error \n");800goto _cleanup;801}802if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */803DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");804ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */805maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);806assert(maxNbBits==9);807}808huffLog = (U32)maxNbBits;809}810811/* looking for most common first offsets */812{ U32 offset;813for (offset=1; offset<MAXREPOFFSET; offset++)814ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);815}816/* note : the result of this phase should be used to better appreciate the impact on statistics */817818total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u];819errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax, /* useLowProbCount */ 1);820if (FSE_isError(errorCode)) {821eSize = errorCode;822DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");823goto _cleanup;824}825Offlog = (U32)errorCode;826827total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];828errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1);829if (FSE_isError(errorCode)) {830eSize = errorCode;831DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");832goto _cleanup;833}834mlLog = (U32)errorCode;835836total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u];837errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1);838if (FSE_isError(errorCode)) {839eSize = errorCode;840DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");841goto _cleanup;842}843llLog = (U32)errorCode;844845/* write result to buffer */846{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog);847if (HUF_isError(hhSize)) {848eSize = hhSize;849DISPLAYLEVEL(1, "HUF_writeCTable error \n");850goto _cleanup;851}852dstPtr += hhSize;853maxDstSize -= hhSize;854eSize += hhSize;855}856857{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);858if (FSE_isError(ohSize)) {859eSize = ohSize;860DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");861goto _cleanup;862}863dstPtr += ohSize;864maxDstSize -= ohSize;865eSize += ohSize;866}867868{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);869if (FSE_isError(mhSize)) {870eSize = mhSize;871DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");872goto _cleanup;873}874dstPtr += mhSize;875maxDstSize -= mhSize;876eSize += mhSize;877}878879{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);880if (FSE_isError(lhSize)) {881eSize = lhSize;882DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");883goto _cleanup;884}885dstPtr += lhSize;886maxDstSize -= lhSize;887eSize += lhSize;888}889890if (maxDstSize<12) {891eSize = ERROR(dstSize_tooSmall);892DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");893goto _cleanup;894}895# if 0896MEM_writeLE32(dstPtr+0, bestRepOffset[0].offset);897MEM_writeLE32(dstPtr+4, bestRepOffset[1].offset);898MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset);899#else900/* at this stage, we don't use the result of "most common first offset",901* as the impact of statistics is not properly evaluated */902MEM_writeLE32(dstPtr+0, repStartValue[0]);903MEM_writeLE32(dstPtr+4, repStartValue[1]);904MEM_writeLE32(dstPtr+8, repStartValue[2]);905#endif906eSize += 12;907908_cleanup:909ZSTD_freeCDict(esr.dict);910ZSTD_freeCCtx(esr.zc);911free(esr.workPlace);912913return eSize;914}915916917/**918* @returns the maximum repcode value919*/920static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])921{922U32 maxRep = reps[0];923int r;924for (r = 1; r < ZSTD_REP_NUM; ++r)925maxRep = MAX(maxRep, reps[r]);926return maxRep;927}928929size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,930const void* customDictContent, size_t dictContentSize,931const void* samplesBuffer, const size_t* samplesSizes,932unsigned nbSamples, ZDICT_params_t params)933{934size_t hSize;935#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */936BYTE header[HBUFFSIZE];937int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;938U32 const notificationLevel = params.notificationLevel;939/* The final dictionary content must be at least as large as the largest repcode */940size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);941size_t paddingSize;942943/* check conditions */944DEBUGLOG(4, "ZDICT_finalizeDictionary");945if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);946if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);947948/* dictionary header */949MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);950{ U64 const randomID = XXH64(customDictContent, dictContentSize, 0);951U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;952U32 const dictID = params.dictID ? params.dictID : compliantID;953MEM_writeLE32(header+4, dictID);954}955hSize = 8;956957/* entropy tables */958DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */959DISPLAYLEVEL(2, "statistics ... \n");960{ size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,961compressionLevel,962samplesBuffer, samplesSizes, nbSamples,963customDictContent, dictContentSize,964notificationLevel);965if (ZDICT_isError(eSize)) return eSize;966hSize += eSize;967}968969/* Shrink the content size if it doesn't fit in the buffer */970if (hSize + dictContentSize > dictBufferCapacity) {971dictContentSize = dictBufferCapacity - hSize;972}973974/* Pad the dictionary content with zeros if it is too small */975if (dictContentSize < minContentSize) {976RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,977"dictBufferCapacity too small to fit max repcode");978paddingSize = minContentSize - dictContentSize;979} else {980paddingSize = 0;981}982983{984size_t const dictSize = hSize + paddingSize + dictContentSize;985986/* The dictionary consists of the header, optional padding, and the content.987* The padding comes before the content because the "best" position in the988* dictionary is the last byte.989*/990BYTE* const outDictHeader = (BYTE*)dictBuffer;991BYTE* const outDictPadding = outDictHeader + hSize;992BYTE* const outDictContent = outDictPadding + paddingSize;993994assert(dictSize <= dictBufferCapacity);995assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);996997/* First copy the customDictContent into its final location.998* `customDictContent` and `dictBuffer` may overlap, so we must999* do this before any other writes into the output buffer.1000* Then copy the header & padding into the output buffer.1001*/1002memmove(outDictContent, customDictContent, dictContentSize);1003memcpy(outDictHeader, header, hSize);1004memset(outDictPadding, 0, paddingSize);10051006return dictSize;1007}1008}100910101011static size_t ZDICT_addEntropyTablesFromBuffer_advanced(1012void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,1013const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,1014ZDICT_params_t params)1015{1016int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;1017U32 const notificationLevel = params.notificationLevel;1018size_t hSize = 8;10191020/* calculate entropy tables */1021DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */1022DISPLAYLEVEL(2, "statistics ... \n");1023{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,1024compressionLevel,1025samplesBuffer, samplesSizes, nbSamples,1026(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,1027notificationLevel);1028if (ZDICT_isError(eSize)) return eSize;1029hSize += eSize;1030}10311032/* add dictionary header (after entropy tables) */1033MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);1034{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);1035U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;1036U32 const dictID = params.dictID ? params.dictID : compliantID;1037MEM_writeLE32((char*)dictBuffer+4, dictID);1038}10391040if (hSize + dictContentSize < dictBufferCapacity)1041memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);1042return MIN(dictBufferCapacity, hSize+dictContentSize);1043}10441045/*! ZDICT_trainFromBuffer_unsafe_legacy() :1046* Warning : `samplesBuffer` must be followed by noisy guard band !!!1047* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()1048*/1049/* Begin FreeBSD - This symbol is needed by dll-linked CLI zstd(1). */1050ZSTDLIB_API1051/* End FreeBSD */1052static size_t ZDICT_trainFromBuffer_unsafe_legacy(1053void* dictBuffer, size_t maxDictSize,1054const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,1055ZDICT_legacy_params_t params)1056{1057U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));1058dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));1059unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;1060unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;1061size_t const targetDictSize = maxDictSize;1062size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);1063size_t dictSize = 0;1064U32 const notificationLevel = params.zParams.notificationLevel;10651066/* checks */1067if (!dictList) return ERROR(memory_allocation);1068if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */1069if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */10701071/* init */1072ZDICT_initDictItem(dictList);10731074/* build dictionary */1075ZDICT_trainBuffer_legacy(dictList, dictListSize,1076samplesBuffer, samplesBuffSize,1077samplesSizes, nbSamples,1078minRep, notificationLevel);10791080/* display best matches */1081if (params.zParams.notificationLevel>= 3) {1082unsigned const nb = MIN(25, dictList[0].pos);1083unsigned const dictContentSize = ZDICT_dictSize(dictList);1084unsigned u;1085DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList[0].pos-1, dictContentSize);1086DISPLAYLEVEL(3, "list %u best segments \n", nb-1);1087for (u=1; u<nb; u++) {1088unsigned const pos = dictList[u].pos;1089unsigned const length = dictList[u].length;1090U32 const printedLength = MIN(40, length);1091if ((pos > samplesBuffSize) || ((pos + length) > samplesBuffSize)) {1092free(dictList);1093return ERROR(GENERIC); /* should never happen */1094}1095DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",1096u, length, pos, (unsigned)dictList[u].savings);1097ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);1098DISPLAYLEVEL(3, "| \n");1099} }110011011102/* create dictionary */1103{ unsigned dictContentSize = ZDICT_dictSize(dictList);1104if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */1105if (dictContentSize < targetDictSize/4) {1106DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (unsigned)maxDictSize);1107if (samplesBuffSize < 10 * targetDictSize)1108DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize>>20));1109if (minRep > MINRATIO) {1110DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);1111DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");1112}1113}11141115if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {1116unsigned proposedSelectivity = selectivity-1;1117while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }1118DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (unsigned)maxDictSize);1119DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);1120DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");1121}11221123/* limit dictionary size */1124{ U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */1125U32 currentSize = 0;1126U32 n; for (n=1; n<max; n++) {1127currentSize += dictList[n].length;1128if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }1129}1130dictList->pos = n;1131dictContentSize = currentSize;1132}11331134/* build dict content */1135{ U32 u;1136BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;1137for (u=1; u<dictList->pos; u++) {1138U32 l = dictList[u].length;1139ptr -= l;1140if (ptr<(BYTE*)dictBuffer) { free(dictList); return ERROR(GENERIC); } /* should not happen */1141memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);1142} }11431144dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,1145samplesBuffer, samplesSizes, nbSamples,1146params.zParams);1147}11481149/* clean up */1150free(dictList);1151return dictSize;1152}115311541155/* ZDICT_trainFromBuffer_legacy() :1156* issue : samplesBuffer need to be followed by a noisy guard band.1157* work around : duplicate the buffer, and add the noise */1158size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,1159const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,1160ZDICT_legacy_params_t params)1161{1162size_t result;1163void* newBuff;1164size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);1165if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */11661167newBuff = malloc(sBuffSize + NOISELENGTH);1168if (!newBuff) return ERROR(memory_allocation);11691170memcpy(newBuff, samplesBuffer, sBuffSize);1171ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */11721173result =1174ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,1175samplesSizes, nbSamples, params);1176free(newBuff);1177return result;1178}117911801181size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,1182const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)1183{1184ZDICT_fastCover_params_t params;1185DEBUGLOG(3, "ZDICT_trainFromBuffer");1186memset(¶ms, 0, sizeof(params));1187params.d = 8;1188params.steps = 4;1189/* Use default level since no compression level information is available */1190params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;1191#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)1192params.zParams.notificationLevel = DEBUGLEVEL;1193#endif1194return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,1195samplesBuffer, samplesSizes, nbSamples,1196¶ms);1197}11981199size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,1200const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)1201{1202ZDICT_params_t params;1203memset(¶ms, 0, sizeof(params));1204return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,1205samplesBuffer, samplesSizes, nbSamples,1206params);1207}120812091210