Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.c
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* *****************************************************************************
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 *
 * Adapted from code originally written by @ot (Giuseppe Ottaviano).
 ******************************************************************************/

/*-*************************************
* Dependencies
***************************************/
#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */

#ifndef ZDICT_STATIC_LINKING_ONLY
#  define ZDICT_STATIC_LINKING_ONLY
#endif

#include "../common/mem.h" /* read */
#include "../common/pool.h"
#include "../common/threading.h"
#include "../common/zstd_internal.h" /* includes zstd.h */
#include "../common/bits.h" /* ZSTD_highbit32 */
#include "../zdict.h"
#include "cover.h"

/*-*************************************
* Constants
***************************************/
/**
 * 32-bit indexes are used to reference samples, so the total samples size is
 * limited to 4 GB on 64-bit builds.
 * For 32-bit builds we choose 1 GB.
 * Most 32-bit platforms have 2 GB of user-mode addressable space, and we
 * allocate one large contiguous buffer, so 1 GB is already a high limit.
 */
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define COVER_DEFAULT_SPLITPOINT 1.0

/*-*************************************
* Console display
***************************************/
#ifndef LOCALDISPLAYLEVEL
static int g_displayLevel = 0;
#endif
#undef  DISPLAY
#define DISPLAY(...)                                                           \
  {                                                                            \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  }
#undef  LOCALDISPLAYLEVEL
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
  if (displayLevel >= l) {                                                     \
    DISPLAY(__VA_ARGS__);                                                      \
  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
#undef  DISPLAYLEVEL
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

#ifndef LOCALDISPLAYUPDATE
static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
#endif
#undef  LOCALDISPLAYUPDATE
#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
  if (displayLevel >= l) {                                                     \
    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {           \
      g_time = clock();                                                        \
      DISPLAY(__VA_ARGS__);                                                    \
    }                                                                          \
  }
#undef  DISPLAYUPDATE
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)

/*-*************************************
* Hash table
***************************************
* A small specialized hash map for storing activeDmers.
* The map does not resize, so if it becomes full it will loop forever.
* Thus, the map must be large enough to store every value.
* The map implements linear probing and keeps its load less than 0.5.
*/

#define MAP_EMPTY_VALUE ((U32)-1)
typedef struct COVER_map_pair_t_s {
  U32 key;
  U32 value;
} COVER_map_pair_t;

typedef struct COVER_map_s {
  COVER_map_pair_t *data;
  U32 sizeLog;
  U32 size;
  U32 sizeMask;
} COVER_map_t;

/**
 * Clear the map.
 */
static void COVER_map_clear(COVER_map_t *map) {
  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
}

/**
 * Initializes a map of the given size.
 * Returns 1 on success and 0 on failure.
 * The map must be destroyed with COVER_map_destroy().
 * The map is only guaranteed to be large enough to hold size elements.
 */
static int COVER_map_init(COVER_map_t *map, U32 size) {
  map->sizeLog = ZSTD_highbit32(size) + 2;
  map->size = (U32)1 << map->sizeLog;
  map->sizeMask = map->size - 1;
  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
  if (!map->data) {
    map->sizeLog = 0;
    map->size = 0;
    return 0;
  }
  COVER_map_clear(map);
  return 1;
}

/**
 * Internal hash function
 */
static const U32 COVER_prime4bytes = 2654435761U;
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
  return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
}

/**
 * Helper function that returns the index that a key should be placed into.
 */
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
  const U32 hash = COVER_map_hash(map, key);
  U32 i;
  for (i = hash;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *pos = &map->data[i];
    if (pos->value == MAP_EMPTY_VALUE) {
      return i;
    }
    if (pos->key == key) {
      return i;
    }
  }
}

/**
 * Returns the pointer to the value for key.
 * If key is not in the map, it is inserted and the value is set to 0.
 * The map must not be full.
 */
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
  if (pos->value == MAP_EMPTY_VALUE) {
    pos->key = key;
    pos->value = 0;
  }
  return &pos->value;
}

/**
 * Deletes key from the map if present.
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t *del = &map->data[i];
  U32 shift = 1;
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}

/**
 * Destroys a map that is initialized with COVER_map_init().
 */
static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
    free(map->data);
  }
  map->data = NULL;
  map->size = 0;
}
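
/*
 * Usage sketch for the map above (illustration only, not called by the
 * library): COVER_map_at() performs lookup-or-insert in one call, which is
 * how the segment selector below maintains dmer occurrence counts.
 *
 *     COVER_map_t map;
 *     if (COVER_map_init(&map, 1024)) {
 *         U32 *occ = COVER_map_at(&map, 42);   <- inserts {42, 0} if absent
 *         *occ += 1;                           <- 42 now maps to 1
 *         COVER_map_remove(&map, 42);          <- backward-shift deletion
 *         COVER_map_destroy(&map);
 *     }
 *
 * COVER_map_init() rounds the capacity up to 2^(ZSTD_highbit32(size) + 2)
 * slots, which keeps the load factor below 0.5 for `size` insertions.
 */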

/*-*************************************
* Context
***************************************/

typedef struct {
  const BYTE *samples;
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
  size_t nbTrainSamples;
  size_t nbTestSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
  U32 *dmerAt;
  unsigned d;
} COVER_ctx_t;

/* We need a global context for qsort... */
static COVER_ctx_t *g_coverCtx = NULL;

/*-*************************************
* Helper functions
***************************************/

/**
 * Returns the sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  unsigned i;
  for (i = 0; i < nbSamples; ++i) {
    sum += samplesSizes[i];
  }
  return sum;
}

/**
 * Returns -1 if the dmer at lp is less than the dmer at rp.
 * Returns 0 if the dmers at lp and rp are equal.
 * Returns 1 if the dmer at lp is greater than the dmer at rp.
 */
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U32 const lhs = *(U32 const *)lp;
  U32 const rhs = *(U32 const *)rp;
  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}
/**
 * Faster version for d <= 8.
 */
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
  if (lhs < rhs) {
    return -1;
  }
  return (lhs > rhs);
}

/**
 * Same as COVER_cmp() except ties are broken by pointer value.
 * NOTE: g_coverCtx must be set to call this function. A global is required
 * because qsort doesn't take an opaque pointer.
 */
static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
  int result = COVER_cmp(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}
/**
 * Faster version for d <= 8.
 */
static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
  int result = COVER_cmp8(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}
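
/*
 * Worked example for COVER_cmp8() (illustration only): with d = 6 the mask is
 * ((U64)1 << 48) - 1 = 0x0000FFFFFFFFFFFF. MEM_readLE64() loads 8 bytes as a
 * little-endian word, so the masked low 48 bits hold exactly the first 6
 * bytes of the dmer. Note the resulting order differs from the lexicographic
 * order of COVER_cmp() (the word compare weighs the last byte most heavily),
 * but either total order works here: the sort only needs to place equal
 * dmers in adjacent positions for COVER_groupBy() below.
 */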

/**
 * Returns the first pointer in [first, last) whose element does not compare
 * less than value. If no such element exists it returns last.
 */
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
                                       size_t value) {
  size_t count = last - first;
  while (count != 0) {
    size_t step = count / 2;
    const size_t *ptr = first;
    ptr += step;
    if (*ptr < value) {
      first = ++ptr;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first;
}

/**
 * Generic groupBy function.
 * Groups an array sorted by cmp into groups with equivalent values.
 * Calls grp for each group.
 */
static void
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
              int (*cmp)(COVER_ctx_t *, const void *, const void *),
              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
  const BYTE *ptr = (const BYTE *)data;
  size_t num = 0;
  while (num < count) {
    const BYTE *grpEnd = ptr + size;
    ++num;
    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
      grpEnd += size;
      ++num;
    }
    grp(ctx, ptr, grpEnd);
    ptr = grpEnd;
  }
}

/*-*************************************
* Cover functions
***************************************/

/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Details */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again. We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}
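
/*
 * Worked example of the frequency pipeline (illustration only): with three
 * samples of sizes {10, 15, 5}, ctx->offsets = {0, 10, 25, 30}. If one dmer
 * occurs at positions {3, 7, 12}, the stable sort puts those three suffix
 * entries in one run, and COVER_groupBy() passes the whole run to
 * COVER_group() in a single call. There, position 3 bumps freq and
 * COVER_lower_bound() finds the enclosing sample's end offset 10; position 7
 * is skipped (7 < 10, same sample); position 12 bumps freq again. The stored
 * frequency is 2: the number of samples containing the dmer, not its raw
 * occurrence count of 3.
 */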

/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end,
                                           ZDICT_cover_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;

    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }

    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
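
/*
 * Worked example of the score (illustration only): with k = 4 and d = 2, a
 * segment spans dmersInK = 3 dmer positions. If those positions hold dmers
 * {A, A, B} with freqs[A] = 9 and freqs[B] = 4, the segment scores
 * 9 + 4 = 13 rather than 22: the activeDmers occurrence map ensures each
 * distinct dmer adds its frequency only once while it remains inside the
 * sliding window.
 */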

/**
 * Check the validity of the parameters.
 * Returns non-zero if the parameters are valid and 0 otherwise.
 */
static int COVER_checkParameters(ZDICT_cover_params_t parameters,
                                 size_t maxDictSize) {
  /* k and d are required parameters */
  if (parameters.d == 0 || parameters.k == 0) {
    return 0;
  }
  /* k <= maxDictSize */
  if (parameters.k > maxDictSize) {
    return 0;
  }
  /* d <= k */
  if (parameters.d > parameters.k) {
    return 0;
  }
  /* 0 < splitPoint <= 1 */
  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
    return 0;
  }
  return 1;
}

/**
 * Clean up a context initialized with `COVER_ctx_init()`.
 */
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  if (!ctx) {
    return;
  }
  if (ctx->suffix) {
    free(ctx->suffix);
    ctx->suffix = NULL;
  }
  if (ctx->freqs) {
    free(ctx->freqs);
    ctx->freqs = NULL;
  }
  if (ctx->dmerAt) {
    free(ctx->dmerAt);
    ctx->dmerAt = NULL;
  }
  if (ctx->offsets) {
    free(ctx->offsets);
    ctx->offsets = NULL;
  }
}
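
/*
 * Example (illustration only): for a 100 KB dictionary buffer, k = 1024,
 * d = 8, splitPoint = 0.75 passes every check above (both k and d nonzero,
 * 1024 <= 102400, 8 <= 1024, 0 < 0.75 <= 1), whereas k = 8, d = 16 fails
 * the d <= k check.
 */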

/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 0 on success or error code on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 */
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                             const size_t *samplesSizes, unsigned nbSamples,
                             unsigned d, double splitPoint) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Split samples into testing and training sets */
  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
                 (unsigned)(totalSamplesSize >> 20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return ERROR(srcSize_wrong);
  }
  /* Check if there are at least 5 training samples */
  if (nbTrainSamples < 5) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
    return ERROR(srcSize_wrong);
  }
  /* Check if there is at least one testing sample */
  if (nbTestSamples < 1) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return ERROR(srcSize_wrong);
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
               (unsigned)trainingSamplesSize);
  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
               (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  ctx->nbTrainSamples = nbTrainSamples;
  ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return ERROR(memory_allocation);
  }
  ctx->freqs = NULL;
  ctx->d = d;

  /* Fill offsets from the samplesSizes */
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global.
     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
     */
    g_coverCtx = ctx;
#if defined(__OpenBSD__)
    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
              (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#else
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#endif
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
   *    (groupBeginPtr - suffix). This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 0;
}

void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
{
  const double ratio = (double)nbDmers / (double)maxDictSize;
  if (ratio >= 10) {
    return;
  }
  LOCALDISPLAYLEVEL(displayLevel, 1,
                    "WARNING: The maximum dictionary size %u is too large "
                    "compared to the source size %u! "
                    "size(source)/size(dictionary) = %f, but it should be >= "
                    "10! This may lead to a subpar dictionary! We recommend "
                    "training on sources at least 10x, and preferably 100x "
                    "the size of the dictionary! \n", (U32)maxDictSize,
                    (U32)nbDmers, ratio);
}

COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 minEpochSize = k * 10;
  COVER_epoch_info_t epochs;
  epochs.num = MAX(1, maxDictSize / k / passes);
  epochs.size = nbDmers / epochs.num;
  if (epochs.size >= minEpochSize) {
    assert(epochs.size * epochs.num <= nbDmers);
    return epochs;
  }
  epochs.size = MIN(minEpochSize, nbDmers);
  epochs.num = nbDmers / epochs.size;
  assert(epochs.size * epochs.num <= nbDmers);
  return epochs;
}
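
/*
 * Worked example for COVER_computeEpochs() (illustration only): with
 * maxDictSize = 102400, nbDmers = 1000000, k = 1024, and passes = 4:
 *
 *     epochs.num  = MAX(1, 102400 / 1024 / 4) = 25
 *     epochs.size = 1000000 / 25              = 40000
 *
 * Since 40000 >= minEpochSize = 10240, these values stand: the dictionary is
 * assembled from roughly 4 passes over 25 epochs, selecting one segment per
 * epoch visit in COVER_buildDictionary() below.
 */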

/**
 * Given the prepared context build the dictionary.
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data into epochs. We will select one segment from each epoch. */
  const COVER_epoch_info_t epochs = COVER_computeEpochs(
      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
  size_t zeroScoreRun = 0;
  size_t epoch;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
               (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
    const U32 epochBegin = (U32)(epoch * epochs.size);
    const U32 epochEnd = epochBegin + epochs.size;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* If the segment covers no dmers, then we are out of content.
     * There may be new content in other epochs, so continue for some time.
     */
    if (segment.score == 0) {
      if (++zeroScoreRun >= maxZeroScoreRun) {
        break;
      }
      continue;
    }
    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize < parameters.d) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        2, "\r%u%%       ",
        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}
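
/*
 * Resulting buffer layout (sketch): the dictionary content is packed at the
 * back of the buffer and the returned `tail` marks the first used byte, with
 * segment 1 being the first segment selected:
 *
 *     dictBuffer: [ unused........ | segment n | ... | segment 2 | segment 1 ]
 *                 0                tail                     dictBufferCapacity
 *
 * Callers then hand dict + tail and dictBufferCapacity - tail to
 * ZDICT_finalizeDictionary(), as ZDICT_trainFromBuffer_cover() does next.
 */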

ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity,
    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t parameters)
{
  BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  parameters.splitPoint = 1.0;
  /* Initialize global data */
  g_displayLevel = (int)parameters.zParams.notificationLevel;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize context and activeDmers */
  {
    size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                                          parameters.d, parameters.splitPoint);
    if (ZSTD_isError(initVal)) {
      return initVal;
    }
  }
  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(memory_allocation);
  }

  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (unsigned)dictionarySize);
    }
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}
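
/*
 * Example usage (sketch; buffer setup and error handling abbreviated). Unlike
 * the optimizing variant at the end of this file, k and d must be set
 * explicitly here, or COVER_checkParameters() rejects the call:
 *
 *     ZDICT_cover_params_t params;
 *     size_t dictSize;
 *     memset(&params, 0, sizeof(params));
 *     params.k = 1024;                      <- segment size, required
 *     params.d = 8;                         <- dmer size, required, d <= k
 *     params.zParams.compressionLevel = 3;
 *     dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, dictBufferCapacity,
 *                                            samplesBuffer, samplesSizes,
 *                                            nbSamples, params);
 *     if (ZDICT_isError(dictSize)) { handle the error }
 */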

size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                      const size_t *samplesSizes, const BYTE *samples,
                                      size_t *offsets,
                                      size_t nbTrainSamples, size_t nbSamples,
                                      BYTE *const dict, size_t dictBufferCapacity) {
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Pointers */
  ZSTD_CCtx *cctx;
  ZSTD_CDict *cdict;
  void *dst;
  /* Local variables */
  size_t dstCapacity;
  size_t i;
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    for (; i < nbSamples; ++i) {
      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
    dst = malloc(dstCapacity);
  }
  /* Create the cctx and cdict */
  cctx = ZSTD_createCCtx();
  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
                           parameters.zParams.compressionLevel);
  if (!dst || !cctx || !cdict) {
    goto _compressCleanup;
  }
  /* Compress each sample and sum their sizes (or error) */
  totalCompressedSize = dictBufferCapacity;
  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
  for (; i < nbSamples; ++i) {
    const size_t size = ZSTD_compress_usingCDict(
        cctx, dst, dstCapacity, samples + offsets[i],
        samplesSizes[i], cdict);
    if (ZSTD_isError(size)) {
      totalCompressedSize = size;
      goto _compressCleanup;
    }
    totalCompressedSize += size;
  }
_compressCleanup:
  ZSTD_freeCCtx(cctx);
  ZSTD_freeCDict(cdict);
  if (dst) {
    free(dst);
  }
  return totalCompressedSize;
}


/**
 * Initialize the `COVER_best_t`.
 */
void COVER_best_init(COVER_best_t *best) {
  if (best == NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
  best->liveJobs = 0;
  best->dict = NULL;
  best->dictSize = 0;
  best->compressedSize = (size_t)-1;
  memset(&best->parameters, 0, sizeof(best->parameters));
}

/**
 * Wait until liveJobs == 0.
 */
void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  while (best->liveJobs != 0) {
    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
  }
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
  COVER_best_wait(best);
  if (best->dict) {
    free(best->dict);
  }
  ZSTD_pthread_mutex_destroy(&best->mutex);
  ZSTD_pthread_cond_destroy(&best->cond);
}

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  ++best->liveJobs;
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Called when a thread finishes executing, whether on success or error.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
                       COVER_dictSelection_t selection) {
  void *dict = selection.dictContent;
  size_t compressedSize = selection.totalCompressedSize;
  size_t dictSize = selection.dictSize;
  if (!best) {
    return;
  }
  {
    size_t liveJobs;
    ZSTD_pthread_mutex_lock(&best->mutex);
    --best->liveJobs;
    liveJobs = best->liveJobs;
    /* If the new dictionary is better */
    if (compressedSize < best->compressedSize) {
      /* Allocate space if necessary */
      if (!best->dict || best->dictSize < dictSize) {
        if (best->dict) {
          free(best->dict);
        }
        best->dict = malloc(dictSize);
        if (!best->dict) {
          best->compressedSize = ERROR(GENERIC);
          best->dictSize = 0;
          ZSTD_pthread_cond_signal(&best->cond);
          ZSTD_pthread_mutex_unlock(&best->mutex);
          return;
        }
      }
      /* Save the dictionary, parameters, and size */
      if (dict) {
        memcpy(best->dict, dict, dictSize);
        best->dictSize = dictSize;
        best->parameters = parameters;
        best->compressedSize = compressedSize;
      }
    }
    if (liveJobs == 0) {
      ZSTD_pthread_cond_broadcast(&best->cond);
    }
    ZSTD_pthread_mutex_unlock(&best->mutex);
  }
}
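
/*
 * Intended COVER_best_t lifecycle (sketch), as used by
 * ZDICT_optimizeTrainFromBuffer_cover() at the end of this file:
 *
 *     COVER_best_t best;
 *     COVER_best_init(&best);
 *     for each parameter trial {
 *         COVER_best_start(&best);          <- before launching the job
 *         launch job; the job itself ends with
 *             COVER_best_finish(&best, parameters, selection);
 *     }
 *     COVER_best_wait(&best);               <- blocks until liveJobs == 0
 *     read best.dict, best.dictSize, best.parameters, best.compressedSize
 *     COVER_best_destroy(&best);
 */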

static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)
{
  COVER_dictSelection_t ds;
  ds.dictContent = buf;
  ds.dictSize = s;
  ds.totalCompressedSize = csz;
  return ds;
}

COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
  return setDictSelection(NULL, 0, error);
}

unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
}

void COVER_dictSelectionFree(COVER_dictSelection_t selection) {
  free(selection.dictContent);
}

COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
    size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
    size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {

  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;

  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;

  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
      largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }

  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);

  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }

  if (params.shrinkDict == 0) {
    free(candidateDictBuffer);
    return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
  }

  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;

  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    dictContentSize = ZDICT_finalizeDictionary(
        candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
        samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);

    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);
    }

    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);

    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }

    if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
      free(largestDictbuffer);
      return setDictSelection(candidateDictBuffer, dictContentSize, totalCompressedSize);
    }
    dictContentSize *= 2;
  }
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  free(candidateDictBuffer);
  return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
}
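
/*
 * Worked example of the shrinking loop above (illustration only): with
 * params.shrinkDictMaxRegression = 5, regressionTolerance = 1.05. Candidate
 * sizes start at ZDICT_DICTSIZE_MIN and double each round; the first
 * candidate whose total compressed size satisfies
 * totalCompressedSize <= largestCompressed * 1.05 wins, trading at most a 5%
 * compression regression for a smaller dictionary. If no candidate
 * qualifies, the full-size dictionary is returned unchanged.
 */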

/**
 * Parameters for COVER_tryParameters().
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;
  COVER_best_t *best;
  size_t dictBufferCapacity;
  ZDICT_cover_params_t parameters;
} COVER_tryParameters_data_t;

/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
static void COVER_tryParameters(void *opaque)
{
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const ZDICT_cover_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
  U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
        totalCompressedSize);

    if (COVER_dictSelectionIsError(selection)) {
      DISPLAYLEVEL(1, "Failed to select dictionary\n");
      goto _cleanup;
    }
  }
_cleanup:
  free(dict);
  COVER_best_finish(data->best, parameters, selection);
  free(data);
  COVER_map_destroy(&activeDmers);
  COVER_dictSelectionFree(selection);
  free(freqs);
}

ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
    const size_t* samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t* parameters)
{
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const double splitPoint =
      parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
  const unsigned shrinkDict = 0;
  /* Local variables */
  const int displayLevel = parameters->zParams.notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  int warned = 0;

  /* Checks */
  if (splitPoint <= 0 || splitPoint > 1) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Turn down global display level to clean up display at level 2 and below */
  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
  /* Loop through d first because each new value needs a new context */
  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
    {
      const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
      if (ZSTD_isError(initVal)) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
        COVER_best_destroy(&best);
        POOL_free(pool);
        return initVal;
      }
    }
    if (!warned) {
      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
      warned = 1;
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
      if (!data) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(memory_allocation);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.splitPoint = splitPoint;
      data->parameters.steps = kSteps;
      data->parameters.shrinkDict = shrinkDict;
      data->parameters.zParams.notificationLevel = g_displayLevel;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
        free(data);
        continue;
      }
      /* Call the function and pass ownership of data to it */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
                         (unsigned)((iteration * 100) / kIterations));
      ++iteration;
    }
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}
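
/*
 * Example usage (sketch; error handling abbreviated): leaving k, d, and steps
 * at 0 searches the default grid (d in {6, 8}; k from 50 to 2000 in
 * (2000 - 50) / 40 = 48-wide steps), and the winning parameters are written
 * back through the pointer:
 *
 *     ZDICT_cover_params_t params;
 *     size_t dictSize;
 *     memset(&params, 0, sizeof(params));
 *     params.nbThreads = 4;
 *     dictSize = ZDICT_optimizeTrainFromBuffer_cover(
 *         dictBuffer, dictBufferCapacity, samplesBuffer, samplesSizes,
 *         nbSamples, &params);
 *     if (!ZDICT_isError(dictSize)) {
 *         params.k and params.d now hold the best parameters found
 *     }
 */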