Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.c
5043 views
/*1* Copyright (c) Meta Platforms, Inc. and affiliates.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910/* *****************************************************************************11* Constructs a dictionary using a heuristic based on the following paper:12*13* Liao, Petri, Moffat, Wirth14* Effective Construction of Relative Lempel-Ziv Dictionaries15* Published in WWW 2016.16*17* Adapted from code originally written by @ot (Giuseppe Ottaviano).18******************************************************************************/1920/*-*************************************21* Dependencies22***************************************/23/* qsort_r is an extension. */24#if defined(__linux) || defined(__linux__) || defined(linux) || defined(__gnu_linux__) || \25defined(__CYGWIN__) || defined(__MSYS__)26#if !defined(_GNU_SOURCE) && !defined(__ANDROID__) /* NDK doesn't ship qsort_r(). */27#define _GNU_SOURCE28#endif29#endif3031#include <stdio.h> /* fprintf */32#include <stdlib.h> /* malloc, free, qsort_r */3334#include <string.h> /* memset */35#include <time.h> /* clock */3637#ifndef ZDICT_STATIC_LINKING_ONLY38# define ZDICT_STATIC_LINKING_ONLY39#endif4041#include "../common/mem.h" /* read */42#include "../common/pool.h" /* POOL_ctx */43#include "../common/threading.h" /* ZSTD_pthread_mutex_t */44#include "../common/zstd_internal.h" /* includes zstd.h */45#include "../common/bits.h" /* ZSTD_highbit32 */46#include "../zdict.h"47#include "cover.h"4849/*-*************************************50* Constants51***************************************/52/**53* There are 32bit indexes used to ref samples, so limit samples size to 4GB54* on 64bit builds.55* For 32bit builds we choose 1 GB.56* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large57* contiguous buffer, so 1GB is already a high limit.58*/59#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))60#define COVER_DEFAULT_SPLITPOINT 1.06162/*-*************************************63* Console display64***************************************/65#ifndef LOCALDISPLAYLEVEL66static int g_displayLevel = 0;67#endif68#undef DISPLAY69#define DISPLAY(...) \70{ \71fprintf(stderr, __VA_ARGS__); \72fflush(stderr); \73}74#undef LOCALDISPLAYLEVEL75#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \76if (displayLevel >= l) { \77DISPLAY(__VA_ARGS__); \78} /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */79#undef DISPLAYLEVEL80#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)8182#ifndef LOCALDISPLAYUPDATE83static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;84static clock_t g_time = 0;85#endif86#undef LOCALDISPLAYUPDATE87#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \88if (displayLevel >= l) { \89if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \90g_time = clock(); \91DISPLAY(__VA_ARGS__); \92} \93}94#undef DISPLAYUPDATE95#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)9697/*-*************************************98* Hash table99***************************************100* A small specialized hash map for storing activeDmers.101* The map does not resize, so if it becomes full it will loop forever.102* Thus, the map must be large enough to store every value.103* The map implements linear probing and keeps its load less than 0.5.104*/105106#define MAP_EMPTY_VALUE ((U32)-1)107typedef struct COVER_map_pair_t_s {108U32 key;109U32 value;110} COVER_map_pair_t;111112typedef struct COVER_map_s {113COVER_map_pair_t *data;114U32 sizeLog;115U32 size;116U32 sizeMask;117} COVER_map_t;118119/**120* Clear the map.121*/122static void COVER_map_clear(COVER_map_t *map) {123memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));124}125126/**127* Initializes a map of the given size.128* Returns 1 on success and 0 on failure.129* The map must be destroyed with COVER_map_destroy().130* The map is only guaranteed to be large enough to hold size elements.131*/132static int COVER_map_init(COVER_map_t *map, U32 size) {133map->sizeLog = ZSTD_highbit32(size) + 2;134map->size = (U32)1 << map->sizeLog;135map->sizeMask = map->size - 1;136map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));137if (!map->data) {138map->sizeLog = 0;139map->size = 0;140return 0;141}142COVER_map_clear(map);143return 1;144}145146/**147* Internal hash function148*/149static const U32 COVER_prime4bytes = 2654435761U;150static U32 COVER_map_hash(COVER_map_t *map, U32 key) {151return (key * COVER_prime4bytes) >> (32 - map->sizeLog);152}153154/**155* Helper function that returns the index that a key should be placed into.156*/157static U32 COVER_map_index(COVER_map_t *map, U32 key) {158const U32 hash = COVER_map_hash(map, key);159U32 i;160for (i = hash;; i = (i + 1) & map->sizeMask) {161COVER_map_pair_t *pos = &map->data[i];162if (pos->value == MAP_EMPTY_VALUE) {163return i;164}165if (pos->key == key) {166return i;167}168}169}170171/**172* Returns the pointer to the value for key.173* If key is not in the map, it is inserted and the value is set to 0.174* The map must not be full.175*/176static U32 *COVER_map_at(COVER_map_t *map, U32 key) {177COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];178if (pos->value == MAP_EMPTY_VALUE) {179pos->key = key;180pos->value = 0;181}182return &pos->value;183}184185/**186* Deletes key from the map if present.187*/188static void COVER_map_remove(COVER_map_t *map, U32 key) {189U32 i = COVER_map_index(map, key);190COVER_map_pair_t *del = &map->data[i];191U32 shift = 1;192if (del->value == MAP_EMPTY_VALUE) {193return;194}195for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {196COVER_map_pair_t *const pos = &map->data[i];197/* If the position is empty we are done */198if (pos->value == MAP_EMPTY_VALUE) {199del->value = MAP_EMPTY_VALUE;200return;201}202/* If pos can be moved to del do so */203if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {204del->key = pos->key;205del->value = pos->value;206del = pos;207shift = 1;208} else {209++shift;210}211}212}213214/**215* Destroys a map that is inited with COVER_map_init().216*/217static void COVER_map_destroy(COVER_map_t *map) {218if (map->data) {219free(map->data);220}221map->data = NULL;222map->size = 0;223}224225/*-*************************************226* Context227***************************************/228229typedef struct {230const BYTE *samples;231size_t *offsets;232const size_t *samplesSizes;233size_t nbSamples;234size_t nbTrainSamples;235size_t nbTestSamples;236U32 *suffix;237size_t suffixSize;238U32 *freqs;239U32 *dmerAt;240unsigned d;241} COVER_ctx_t;242243#if !defined(_GNU_SOURCE) && !defined(__APPLE__) && !defined(_MSC_VER)244/* C90 only offers qsort() that needs a global context. */245static COVER_ctx_t *g_coverCtx = NULL;246#endif247248/*-*************************************249* Helper functions250***************************************/251252/**253* Returns the sum of the sample sizes.254*/255size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {256size_t sum = 0;257unsigned i;258for (i = 0; i < nbSamples; ++i) {259sum += samplesSizes[i];260}261return sum;262}263264/**265* Returns -1 if the dmer at lp is less than the dmer at rp.266* Return 0 if the dmers at lp and rp are equal.267* Returns 1 if the dmer at lp is greater than the dmer at rp.268*/269static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {270U32 const lhs = *(U32 const *)lp;271U32 const rhs = *(U32 const *)rp;272return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);273}274/**275* Faster version for d <= 8.276*/277static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {278U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);279U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;280U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;281if (lhs < rhs) {282return -1;283}284return (lhs > rhs);285}286287/**288* Same as COVER_cmp() except ties are broken by pointer value289*/290#if (defined(_WIN32) && defined(_MSC_VER)) || defined(__APPLE__)291static int WIN_CDECL COVER_strict_cmp(void* g_coverCtx, const void* lp, const void* rp) {292#elif defined(_GNU_SOURCE)293static int COVER_strict_cmp(const void *lp, const void *rp, void *g_coverCtx) {294#else /* C90 fallback.*/295static int COVER_strict_cmp(const void *lp, const void *rp) {296#endif297int result = COVER_cmp((COVER_ctx_t*)g_coverCtx, lp, rp);298if (result == 0) {299result = lp < rp ? -1 : 1;300}301return result;302}303/**304* Faster version for d <= 8.305*/306#if (defined(_WIN32) && defined(_MSC_VER)) || defined(__APPLE__)307static int WIN_CDECL COVER_strict_cmp8(void* g_coverCtx, const void* lp, const void* rp) {308#elif defined(_GNU_SOURCE)309static int COVER_strict_cmp8(const void *lp, const void *rp, void *g_coverCtx) {310#else /* C90 fallback.*/311static int COVER_strict_cmp8(const void *lp, const void *rp) {312#endif313int result = COVER_cmp8((COVER_ctx_t*)g_coverCtx, lp, rp);314if (result == 0) {315result = lp < rp ? -1 : 1;316}317return result;318}319320/**321* Abstract away divergence of qsort_r() parameters.322* Hopefully when C11 become the norm, we will be able323* to clean it up.324*/325static void stableSort(COVER_ctx_t *ctx) {326#if defined(__APPLE__)327qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32),328ctx,329(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));330#elif defined(_GNU_SOURCE)331qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32),332(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp),333ctx);334#elif defined(_WIN32) && defined(_MSC_VER)335qsort_s(ctx->suffix, ctx->suffixSize, sizeof(U32),336(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp),337ctx);338#elif defined(__OpenBSD__)339g_coverCtx = ctx;340mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),341(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));342#else /* C90 fallback.*/343g_coverCtx = ctx;344/* TODO(cavalcanti): implement a reentrant qsort() when is not available. */345qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),346(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));347#endif348}349350/**351* Returns the first pointer in [first, last) whose element does not compare352* less than value. If no such element exists it returns last.353*/354static const size_t *COVER_lower_bound(const size_t* first, const size_t* last,355size_t value) {356size_t count = (size_t)(last - first);357assert(last >= first);358while (count != 0) {359size_t step = count / 2;360const size_t *ptr = first;361ptr += step;362if (*ptr < value) {363first = ++ptr;364count -= step + 1;365} else {366count = step;367}368}369return first;370}371372/**373* Generic groupBy function.374* Groups an array sorted by cmp into groups with equivalent values.375* Calls grp for each group.376*/377static void378COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,379int (*cmp)(COVER_ctx_t *, const void *, const void *),380void (*grp)(COVER_ctx_t *, const void *, const void *)) {381const BYTE *ptr = (const BYTE *)data;382size_t num = 0;383while (num < count) {384const BYTE *grpEnd = ptr + size;385++num;386while (num < count && cmp(ctx, ptr, grpEnd) == 0) {387grpEnd += size;388++num;389}390grp(ctx, ptr, grpEnd);391ptr = grpEnd;392}393}394395/*-*************************************396* Cover functions397***************************************/398399/**400* Called on each group of positions with the same dmer.401* Counts the frequency of each dmer and saves it in the suffix array.402* Fills `ctx->dmerAt`.403*/404static void COVER_group(COVER_ctx_t *ctx, const void *group,405const void *groupEnd) {406/* The group consists of all the positions with the same first d bytes. */407const U32 *grpPtr = (const U32 *)group;408const U32 *grpEnd = (const U32 *)groupEnd;409/* The dmerId is how we will reference this dmer.410* This allows us to map the whole dmer space to a much smaller space, the411* size of the suffix array.412*/413const U32 dmerId = (U32)(grpPtr - ctx->suffix);414/* Count the number of samples this dmer shows up in */415U32 freq = 0;416/* Details */417const size_t *curOffsetPtr = ctx->offsets;418const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;419/* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a420* different sample than the last.421*/422size_t curSampleEnd = ctx->offsets[0];423for (; grpPtr != grpEnd; ++grpPtr) {424/* Save the dmerId for this position so we can get back to it. */425ctx->dmerAt[*grpPtr] = dmerId;426/* Dictionaries only help for the first reference to the dmer.427* After that zstd can reference the match from the previous reference.428* So only count each dmer once for each sample it is in.429*/430if (*grpPtr < curSampleEnd) {431continue;432}433freq += 1;434/* Binary search to find the end of the sample *grpPtr is in.435* In the common case that grpPtr + 1 == grpEnd we can skip the binary436* search because the loop is over.437*/438if (grpPtr + 1 != grpEnd) {439const size_t *sampleEndPtr =440COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);441curSampleEnd = *sampleEndPtr;442curOffsetPtr = sampleEndPtr + 1;443}444}445/* At this point we are never going to look at this segment of the suffix446* array again. We take advantage of this fact to save memory.447* We store the frequency of the dmer in the first position of the group,448* which is dmerId.449*/450ctx->suffix[dmerId] = freq;451}452453454/**455* Selects the best segment in an epoch.456* Segments of are scored according to the function:457*458* Let F(d) be the frequency of dmer d.459* Let S_i be the dmer at position i of segment S which has length k.460*461* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})462*463* Once the dmer d is in the dictionary we set F(d) = 0.464*/465static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,466COVER_map_t *activeDmers, U32 begin,467U32 end,468ZDICT_cover_params_t parameters) {469/* Constants */470const U32 k = parameters.k;471const U32 d = parameters.d;472const U32 dmersInK = k - d + 1;473/* Try each segment (activeSegment) and save the best (bestSegment) */474COVER_segment_t bestSegment = {0, 0, 0};475COVER_segment_t activeSegment;476/* Reset the activeDmers in the segment */477COVER_map_clear(activeDmers);478/* The activeSegment starts at the beginning of the epoch. */479activeSegment.begin = begin;480activeSegment.end = begin;481activeSegment.score = 0;482/* Slide the activeSegment through the whole epoch.483* Save the best segment in bestSegment.484*/485while (activeSegment.end < end) {486/* The dmerId for the dmer at the next position */487U32 newDmer = ctx->dmerAt[activeSegment.end];488/* The entry in activeDmers for this dmerId */489U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);490/* If the dmer isn't already present in the segment add its score. */491if (*newDmerOcc == 0) {492/* The paper suggest using the L-0.5 norm, but experiments show that it493* doesn't help.494*/495activeSegment.score += freqs[newDmer];496}497/* Add the dmer to the segment */498activeSegment.end += 1;499*newDmerOcc += 1;500501/* If the window is now too large, drop the first position */502if (activeSegment.end - activeSegment.begin == dmersInK + 1) {503U32 delDmer = ctx->dmerAt[activeSegment.begin];504U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);505activeSegment.begin += 1;506*delDmerOcc -= 1;507/* If this is the last occurrence of the dmer, subtract its score */508if (*delDmerOcc == 0) {509COVER_map_remove(activeDmers, delDmer);510activeSegment.score -= freqs[delDmer];511}512}513514/* If this segment is the best so far save it */515if (activeSegment.score > bestSegment.score) {516bestSegment = activeSegment;517}518}519{520/* Trim off the zero frequency head and tail from the segment. */521U32 newBegin = bestSegment.end;522U32 newEnd = bestSegment.begin;523U32 pos;524for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {525U32 freq = freqs[ctx->dmerAt[pos]];526if (freq != 0) {527newBegin = MIN(newBegin, pos);528newEnd = pos + 1;529}530}531bestSegment.begin = newBegin;532bestSegment.end = newEnd;533}534{535/* Zero out the frequency of each dmer covered by the chosen segment. */536U32 pos;537for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {538freqs[ctx->dmerAt[pos]] = 0;539}540}541return bestSegment;542}543544/**545* Check the validity of the parameters.546* Returns non-zero if the parameters are valid and 0 otherwise.547*/548static int COVER_checkParameters(ZDICT_cover_params_t parameters,549size_t maxDictSize) {550/* k and d are required parameters */551if (parameters.d == 0 || parameters.k == 0) {552return 0;553}554/* k <= maxDictSize */555if (parameters.k > maxDictSize) {556return 0;557}558/* d <= k */559if (parameters.d > parameters.k) {560return 0;561}562/* 0 < splitPoint <= 1 */563if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){564return 0;565}566return 1;567}568569/**570* Clean up a context initialized with `COVER_ctx_init()`.571*/572static void COVER_ctx_destroy(COVER_ctx_t *ctx) {573if (!ctx) {574return;575}576if (ctx->suffix) {577free(ctx->suffix);578ctx->suffix = NULL;579}580if (ctx->freqs) {581free(ctx->freqs);582ctx->freqs = NULL;583}584if (ctx->dmerAt) {585free(ctx->dmerAt);586ctx->dmerAt = NULL;587}588if (ctx->offsets) {589free(ctx->offsets);590ctx->offsets = NULL;591}592}593594/**595* Prepare a context for dictionary building.596* The context is only dependent on the parameter `d` and can be used multiple597* times.598* Returns 0 on success or error code on error.599* The context must be destroyed with `COVER_ctx_destroy()`.600*/601static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,602const size_t *samplesSizes, unsigned nbSamples,603unsigned d, double splitPoint)604{605const BYTE *const samples = (const BYTE *)samplesBuffer;606const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);607/* Split samples into testing and training sets */608const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;609const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;610const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;611const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;612/* Checks */613if (totalSamplesSize < MAX(d, sizeof(U64)) ||614totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {615DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",616(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));617return ERROR(srcSize_wrong);618}619/* Check if there are at least 5 training samples */620if (nbTrainSamples < 5) {621DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);622return ERROR(srcSize_wrong);623}624/* Check if there's testing sample */625if (nbTestSamples < 1) {626DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);627return ERROR(srcSize_wrong);628}629/* Zero the context */630memset(ctx, 0, sizeof(*ctx));631DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,632(unsigned)trainingSamplesSize);633DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,634(unsigned)testSamplesSize);635ctx->samples = samples;636ctx->samplesSizes = samplesSizes;637ctx->nbSamples = nbSamples;638ctx->nbTrainSamples = nbTrainSamples;639ctx->nbTestSamples = nbTestSamples;640/* Partial suffix array */641ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;642ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));643/* Maps index to the dmerID */644ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));645/* The offsets of each file */646ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));647if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {648DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");649COVER_ctx_destroy(ctx);650return ERROR(memory_allocation);651}652ctx->freqs = NULL;653ctx->d = d;654655/* Fill offsets from the samplesSizes */656{657U32 i;658ctx->offsets[0] = 0;659for (i = 1; i <= nbSamples; ++i) {660ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];661}662}663DISPLAYLEVEL(2, "Constructing partial suffix array\n");664{665/* suffix is a partial suffix array.666* It only sorts suffixes by their first parameters.d bytes.667* The sort is stable, so each dmer group is sorted by position in input.668*/669U32 i;670for (i = 0; i < ctx->suffixSize; ++i) {671ctx->suffix[i] = i;672}673stableSort(ctx);674}675DISPLAYLEVEL(2, "Computing frequencies\n");676/* For each dmer group (group of positions with the same first d bytes):677* 1. For each position we set dmerAt[position] = dmerID. The dmerID is678* (groupBeginPtr - suffix). This allows us to go from position to679* dmerID so we can look up values in freq.680* 2. We calculate how many samples the dmer occurs in and save it in681* freqs[dmerId].682*/683COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,684(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);685ctx->freqs = ctx->suffix;686ctx->suffix = NULL;687return 0;688}689690void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)691{692const double ratio = (double)nbDmers / (double)maxDictSize;693if (ratio >= 10) {694return;695}696LOCALDISPLAYLEVEL(displayLevel, 1,697"WARNING: The maximum dictionary size %u is too large "698"compared to the source size %u! "699"size(source)/size(dictionary) = %f, but it should be >= "700"10! This may lead to a subpar dictionary! We recommend "701"training on sources at least 10x, and preferably 100x "702"the size of the dictionary! \n", (U32)maxDictSize,703(U32)nbDmers, ratio);704}705706COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,707U32 nbDmers, U32 k, U32 passes)708{709const U32 minEpochSize = k * 10;710COVER_epoch_info_t epochs;711epochs.num = MAX(1, maxDictSize / k / passes);712epochs.size = nbDmers / epochs.num;713if (epochs.size >= minEpochSize) {714assert(epochs.size * epochs.num <= nbDmers);715return epochs;716}717epochs.size = MIN(minEpochSize, nbDmers);718epochs.num = nbDmers / epochs.size;719assert(epochs.size * epochs.num <= nbDmers);720return epochs;721}722723/**724* Given the prepared context build the dictionary.725*/726static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,727COVER_map_t *activeDmers, void *dictBuffer,728size_t dictBufferCapacity,729ZDICT_cover_params_t parameters) {730BYTE *const dict = (BYTE *)dictBuffer;731size_t tail = dictBufferCapacity;732/* Divide the data into epochs. We will select one segment from each epoch. */733const COVER_epoch_info_t epochs = COVER_computeEpochs(734(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);735const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));736size_t zeroScoreRun = 0;737size_t epoch;738DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",739(U32)epochs.num, (U32)epochs.size);740/* Loop through the epochs until there are no more segments or the dictionary741* is full.742*/743for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {744const U32 epochBegin = (U32)(epoch * epochs.size);745const U32 epochEnd = epochBegin + epochs.size;746size_t segmentSize;747/* Select a segment */748COVER_segment_t segment = COVER_selectSegment(749ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);750/* If the segment covers no dmers, then we are out of content.751* There may be new content in other epochs, for continue for some time.752*/753if (segment.score == 0) {754if (++zeroScoreRun >= maxZeroScoreRun) {755break;756}757continue;758}759zeroScoreRun = 0;760/* Trim the segment if necessary and if it is too small then we are done */761segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);762if (segmentSize < parameters.d) {763break;764}765/* We fill the dictionary from the back to allow the best segments to be766* referenced with the smallest offsets.767*/768tail -= segmentSize;769memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);770DISPLAYUPDATE(7712, "\r%u%% ",772(unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));773}774DISPLAYLEVEL(2, "\r%79s\r", "");775return tail;776}777778ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(779void *dictBuffer, size_t dictBufferCapacity,780const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,781ZDICT_cover_params_t parameters)782{783BYTE* const dict = (BYTE*)dictBuffer;784COVER_ctx_t ctx;785COVER_map_t activeDmers;786parameters.splitPoint = 1.0;787/* Initialize global data */788g_displayLevel = (int)parameters.zParams.notificationLevel;789/* Checks */790if (!COVER_checkParameters(parameters, dictBufferCapacity)) {791DISPLAYLEVEL(1, "Cover parameters incorrect\n");792return ERROR(parameter_outOfBound);793}794if (nbSamples == 0) {795DISPLAYLEVEL(1, "Cover must have at least one input file\n");796return ERROR(srcSize_wrong);797}798if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {799DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",800ZDICT_DICTSIZE_MIN);801return ERROR(dstSize_tooSmall);802}803/* Initialize context and activeDmers */804{805size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,806parameters.d, parameters.splitPoint);807if (ZSTD_isError(initVal)) {808return initVal;809}810}811COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);812if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {813DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");814COVER_ctx_destroy(&ctx);815return ERROR(memory_allocation);816}817818DISPLAYLEVEL(2, "Building dictionary\n");819{820const size_t tail =821COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,822dictBufferCapacity, parameters);823const size_t dictionarySize = ZDICT_finalizeDictionary(824dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,825samplesBuffer, samplesSizes, nbSamples, parameters.zParams);826if (!ZSTD_isError(dictionarySize)) {827DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",828(unsigned)dictionarySize);829}830COVER_ctx_destroy(&ctx);831COVER_map_destroy(&activeDmers);832return dictionarySize;833}834}835836837838size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,839const size_t *samplesSizes, const BYTE *samples,840size_t *offsets,841size_t nbTrainSamples, size_t nbSamples,842BYTE *const dict, size_t dictBufferCapacity) {843size_t totalCompressedSize = ERROR(GENERIC);844/* Pointers */845ZSTD_CCtx *cctx;846ZSTD_CDict *cdict;847void *dst;848/* Local variables */849size_t dstCapacity;850size_t i;851/* Allocate dst with enough space to compress the maximum sized sample */852{853size_t maxSampleSize = 0;854i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;855for (; i < nbSamples; ++i) {856maxSampleSize = MAX(samplesSizes[i], maxSampleSize);857}858dstCapacity = ZSTD_compressBound(maxSampleSize);859dst = malloc(dstCapacity);860}861/* Create the cctx and cdict */862cctx = ZSTD_createCCtx();863cdict = ZSTD_createCDict(dict, dictBufferCapacity,864parameters.zParams.compressionLevel);865if (!dst || !cctx || !cdict) {866goto _compressCleanup;867}868/* Compress each sample and sum their sizes (or error) */869totalCompressedSize = dictBufferCapacity;870i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;871for (; i < nbSamples; ++i) {872const size_t size = ZSTD_compress_usingCDict(873cctx, dst, dstCapacity, samples + offsets[i],874samplesSizes[i], cdict);875if (ZSTD_isError(size)) {876totalCompressedSize = size;877goto _compressCleanup;878}879totalCompressedSize += size;880}881_compressCleanup:882ZSTD_freeCCtx(cctx);883ZSTD_freeCDict(cdict);884if (dst) {885free(dst);886}887return totalCompressedSize;888}889890891/**892* Initialize the `COVER_best_t`.893*/894void COVER_best_init(COVER_best_t *best) {895if (best==NULL) return; /* compatible with init on NULL */896(void)ZSTD_pthread_mutex_init(&best->mutex, NULL);897(void)ZSTD_pthread_cond_init(&best->cond, NULL);898best->liveJobs = 0;899best->dict = NULL;900best->dictSize = 0;901best->compressedSize = (size_t)-1;902memset(&best->parameters, 0, sizeof(best->parameters));903}904905/**906* Wait until liveJobs == 0.907*/908void COVER_best_wait(COVER_best_t *best) {909if (!best) {910return;911}912ZSTD_pthread_mutex_lock(&best->mutex);913while (best->liveJobs != 0) {914ZSTD_pthread_cond_wait(&best->cond, &best->mutex);915}916ZSTD_pthread_mutex_unlock(&best->mutex);917}918919/**920* Call COVER_best_wait() and then destroy the COVER_best_t.921*/922void COVER_best_destroy(COVER_best_t *best) {923if (!best) {924return;925}926COVER_best_wait(best);927if (best->dict) {928free(best->dict);929}930ZSTD_pthread_mutex_destroy(&best->mutex);931ZSTD_pthread_cond_destroy(&best->cond);932}933934/**935* Called when a thread is about to be launched.936* Increments liveJobs.937*/938void COVER_best_start(COVER_best_t *best) {939if (!best) {940return;941}942ZSTD_pthread_mutex_lock(&best->mutex);943++best->liveJobs;944ZSTD_pthread_mutex_unlock(&best->mutex);945}946947/**948* Called when a thread finishes executing, both on error or success.949* Decrements liveJobs and signals any waiting threads if liveJobs == 0.950* If this dictionary is the best so far save it and its parameters.951*/952void COVER_best_finish(COVER_best_t* best,953ZDICT_cover_params_t parameters,954COVER_dictSelection_t selection)955{956void* dict = selection.dictContent;957size_t compressedSize = selection.totalCompressedSize;958size_t dictSize = selection.dictSize;959if (!best) {960return;961}962{963size_t liveJobs;964ZSTD_pthread_mutex_lock(&best->mutex);965--best->liveJobs;966liveJobs = best->liveJobs;967/* If the new dictionary is better */968if (compressedSize < best->compressedSize) {969/* Allocate space if necessary */970if (!best->dict || best->dictSize < dictSize) {971if (best->dict) {972free(best->dict);973}974best->dict = malloc(dictSize);975if (!best->dict) {976best->compressedSize = ERROR(GENERIC);977best->dictSize = 0;978ZSTD_pthread_cond_signal(&best->cond);979ZSTD_pthread_mutex_unlock(&best->mutex);980return;981}982}983/* Save the dictionary, parameters, and size */984if (dict) {985memcpy(best->dict, dict, dictSize);986best->dictSize = dictSize;987best->parameters = parameters;988best->compressedSize = compressedSize;989}990}991if (liveJobs == 0) {992ZSTD_pthread_cond_broadcast(&best->cond);993}994ZSTD_pthread_mutex_unlock(&best->mutex);995}996}997998static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)999{1000COVER_dictSelection_t ds;1001ds.dictContent = buf;1002ds.dictSize = s;1003ds.totalCompressedSize = csz;1004return ds;1005}10061007COVER_dictSelection_t COVER_dictSelectionError(size_t error) {1008return setDictSelection(NULL, 0, error);1009}10101011unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {1012return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);1013}10141015void COVER_dictSelectionFree(COVER_dictSelection_t selection){1016free(selection.dictContent);1017}10181019COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,1020size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,1021size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {10221023size_t largestDict = 0;1024size_t largestCompressed = 0;1025BYTE* customDictContentEnd = customDictContent + dictContentSize;10261027BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);1028BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);1029double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;10301031if (!largestDictbuffer || !candidateDictBuffer) {1032free(largestDictbuffer);1033free(candidateDictBuffer);1034return COVER_dictSelectionError(dictContentSize);1035}10361037/* Initial dictionary size and compressed size */1038memcpy(largestDictbuffer, customDictContent, dictContentSize);1039dictContentSize = ZDICT_finalizeDictionary(1040largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,1041samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);10421043if (ZDICT_isError(dictContentSize)) {1044free(largestDictbuffer);1045free(candidateDictBuffer);1046return COVER_dictSelectionError(dictContentSize);1047}10481049totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,1050samplesBuffer, offsets,1051nbCheckSamples, nbSamples,1052largestDictbuffer, dictContentSize);10531054if (ZSTD_isError(totalCompressedSize)) {1055free(largestDictbuffer);1056free(candidateDictBuffer);1057return COVER_dictSelectionError(totalCompressedSize);1058}10591060if (params.shrinkDict == 0) {1061free(candidateDictBuffer);1062return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);1063}10641065largestDict = dictContentSize;1066largestCompressed = totalCompressedSize;1067dictContentSize = ZDICT_DICTSIZE_MIN;10681069/* Largest dict is initially at least ZDICT_DICTSIZE_MIN */1070while (dictContentSize < largestDict) {1071memcpy(candidateDictBuffer, largestDictbuffer, largestDict);1072dictContentSize = ZDICT_finalizeDictionary(1073candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,1074samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);10751076if (ZDICT_isError(dictContentSize)) {1077free(largestDictbuffer);1078free(candidateDictBuffer);1079return COVER_dictSelectionError(dictContentSize);10801081}10821083totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,1084samplesBuffer, offsets,1085nbCheckSamples, nbSamples,1086candidateDictBuffer, dictContentSize);10871088if (ZSTD_isError(totalCompressedSize)) {1089free(largestDictbuffer);1090free(candidateDictBuffer);1091return COVER_dictSelectionError(totalCompressedSize);1092}10931094if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {1095free(largestDictbuffer);1096return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize );1097}1098dictContentSize *= 2;1099}1100dictContentSize = largestDict;1101totalCompressedSize = largestCompressed;1102free(candidateDictBuffer);1103return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize );1104}11051106/**1107* Parameters for COVER_tryParameters().1108*/1109typedef struct COVER_tryParameters_data_s {1110const COVER_ctx_t *ctx;1111COVER_best_t *best;1112size_t dictBufferCapacity;1113ZDICT_cover_params_t parameters;1114} COVER_tryParameters_data_t;11151116/**1117* Tries a set of parameters and updates the COVER_best_t with the results.1118* This function is thread safe if zstd is compiled with multithreaded support.1119* It takes its parameters as an *OWNING* opaque pointer to support threading.1120*/1121static void COVER_tryParameters(void *opaque)1122{1123/* Save parameters as local variables */1124COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;1125const COVER_ctx_t *const ctx = data->ctx;1126const ZDICT_cover_params_t parameters = data->parameters;1127size_t dictBufferCapacity = data->dictBufferCapacity;1128size_t totalCompressedSize = ERROR(GENERIC);1129/* Allocate space for hash table, dict, and freqs */1130COVER_map_t activeDmers;1131BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);1132COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));1133U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));1134if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {1135DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");1136goto _cleanup;1137}1138if (!dict || !freqs) {1139DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");1140goto _cleanup;1141}1142/* Copy the frequencies because we need to modify them */1143memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));1144/* Build the dictionary */1145{1146const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,1147dictBufferCapacity, parameters);1148selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,1149ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,1150totalCompressedSize);11511152if (COVER_dictSelectionIsError(selection)) {1153DISPLAYLEVEL(1, "Failed to select dictionary\n");1154goto _cleanup;1155}1156}1157_cleanup:1158free(dict);1159COVER_best_finish(data->best, parameters, selection);1160free(data);1161COVER_map_destroy(&activeDmers);1162COVER_dictSelectionFree(selection);1163free(freqs);1164}11651166ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(1167void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,1168const size_t* samplesSizes, unsigned nbSamples,1169ZDICT_cover_params_t* parameters)1170{1171/* constants */1172const unsigned nbThreads = parameters->nbThreads;1173const double splitPoint =1174parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;1175const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;1176const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;1177const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;1178const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;1179const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;1180const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);1181const unsigned kIterations =1182(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);1183const unsigned shrinkDict = 0;1184/* Local variables */1185const int displayLevel = parameters->zParams.notificationLevel;1186unsigned iteration = 1;1187unsigned d;1188unsigned k;1189COVER_best_t best;1190POOL_ctx *pool = NULL;1191int warned = 0;11921193/* Checks */1194if (splitPoint <= 0 || splitPoint > 1) {1195LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");1196return ERROR(parameter_outOfBound);1197}1198if (kMinK < kMaxD || kMaxK < kMinK) {1199LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");1200return ERROR(parameter_outOfBound);1201}1202if (nbSamples == 0) {1203DISPLAYLEVEL(1, "Cover must have at least one input file\n");1204return ERROR(srcSize_wrong);1205}1206if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {1207DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",1208ZDICT_DICTSIZE_MIN);1209return ERROR(dstSize_tooSmall);1210}1211if (nbThreads > 1) {1212pool = POOL_create(nbThreads, 1);1213if (!pool) {1214return ERROR(memory_allocation);1215}1216}1217/* Initialization */1218COVER_best_init(&best);1219/* Turn down global display level to clean up display at level 2 and below */1220g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;1221/* Loop through d first because each new value needs a new context */1222LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",1223kIterations);1224for (d = kMinD; d <= kMaxD; d += 2) {1225/* Initialize the context for this value of d */1226COVER_ctx_t ctx;1227LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);1228{1229const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);1230if (ZSTD_isError(initVal)) {1231LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");1232COVER_best_destroy(&best);1233POOL_free(pool);1234return initVal;1235}1236}1237if (!warned) {1238COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);1239warned = 1;1240}1241/* Loop through k reusing the same context */1242for (k = kMinK; k <= kMaxK; k += kStepSize) {1243/* Prepare the arguments */1244COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(1245sizeof(COVER_tryParameters_data_t));1246LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);1247if (!data) {1248LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");1249COVER_best_destroy(&best);1250COVER_ctx_destroy(&ctx);1251POOL_free(pool);1252return ERROR(memory_allocation);1253}1254data->ctx = &ctx;1255data->best = &best;1256data->dictBufferCapacity = dictBufferCapacity;1257data->parameters = *parameters;1258data->parameters.k = k;1259data->parameters.d = d;1260data->parameters.splitPoint = splitPoint;1261data->parameters.steps = kSteps;1262data->parameters.shrinkDict = shrinkDict;1263data->parameters.zParams.notificationLevel = g_displayLevel;1264/* Check the parameters */1265if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {1266DISPLAYLEVEL(1, "Cover parameters incorrect\n");1267free(data);1268continue;1269}1270/* Call the function and pass ownership of data to it */1271COVER_best_start(&best);1272if (pool) {1273POOL_add(pool, &COVER_tryParameters, data);1274} else {1275COVER_tryParameters(data);1276}1277/* Print status */1278LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",1279(unsigned)((iteration * 100) / kIterations));1280++iteration;1281}1282COVER_best_wait(&best);1283COVER_ctx_destroy(&ctx);1284}1285LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");1286/* Fill the output buffer and parameters with output of the best parameters */1287{1288const size_t dictSize = best.dictSize;1289if (ZSTD_isError(best.compressedSize)) {1290const size_t compressedSize = best.compressedSize;1291COVER_best_destroy(&best);1292POOL_free(pool);1293return compressedSize;1294}1295*parameters = best.parameters;1296memcpy(dictBuffer, best.dict, dictSize);1297COVER_best_destroy(&best);1298POOL_free(pool);1299return dictSize;1300}1301}130213031304