Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.h
5020 views
/*1* Copyright (c) Meta Platforms, Inc. and affiliates.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910#ifndef ZDICT_STATIC_LINKING_ONLY11# define ZDICT_STATIC_LINKING_ONLY12#endif1314#include "../common/threading.h" /* ZSTD_pthread_mutex_t */15#include "../common/mem.h" /* U32, BYTE */16#include "../zdict.h"1718/**19* COVER_best_t is used for two purposes:20* 1. Synchronizing threads.21* 2. Saving the best parameters and dictionary.22*23* All of the methods except COVER_best_init() are thread safe if zstd is24* compiled with multithreaded support.25*/26typedef struct COVER_best_s {27ZSTD_pthread_mutex_t mutex;28ZSTD_pthread_cond_t cond;29size_t liveJobs;30void *dict;31size_t dictSize;32ZDICT_cover_params_t parameters;33size_t compressedSize;34} COVER_best_t;3536/**37* A segment is a range in the source as well as the score of the segment.38*/39typedef struct {40U32 begin;41U32 end;42U32 score;43} COVER_segment_t;4445/**46*Number of epochs and size of each epoch.47*/48typedef struct {49U32 num;50U32 size;51} COVER_epoch_info_t;5253/**54* Struct used for the dictionary selection function.55*/56typedef struct COVER_dictSelection {57BYTE* dictContent;58size_t dictSize;59size_t totalCompressedSize;60} COVER_dictSelection_t;6162/**63* Computes the number of epochs and the size of each epoch.64* We will make sure that each epoch gets at least 10 * k bytes.65*66* The COVER algorithms divide the data up into epochs of equal size and67* select one segment from each epoch.68*69* @param maxDictSize The maximum allowed dictionary size.70* @param nbDmers The number of dmers we are training on.71* @param k The parameter k (segment size).72* @param passes The target number of passes over the dmer corpus.73* More passes means a better dictionary.74*/75COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,76U32 k, U32 passes);7778/**79* Warns the user when their corpus is too small.80*/81void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);8283/**84* Checks total compressed size of a dictionary85*/86size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,87const size_t *samplesSizes, const BYTE *samples,88size_t *offsets,89size_t nbTrainSamples, size_t nbSamples,90BYTE *const dict, size_t dictBufferCapacity);9192/**93* Returns the sum of the sample sizes.94*/95size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;9697/**98* Initialize the `COVER_best_t`.99*/100void COVER_best_init(COVER_best_t *best);101102/**103* Wait until liveJobs == 0.104*/105void COVER_best_wait(COVER_best_t *best);106107/**108* Call COVER_best_wait() and then destroy the COVER_best_t.109*/110void COVER_best_destroy(COVER_best_t *best);111112/**113* Called when a thread is about to be launched.114* Increments liveJobs.115*/116void COVER_best_start(COVER_best_t *best);117118/**119* Called when a thread finishes executing, both on error or success.120* Decrements liveJobs and signals any waiting threads if liveJobs == 0.121* If this dictionary is the best so far save it and its parameters.122*/123void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,124COVER_dictSelection_t selection);125/**126* Error function for COVER_selectDict function. Checks if the return127* value is an error.128*/129unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);130131/**132* Error function for COVER_selectDict function. Returns a struct where133* return.totalCompressedSize is a ZSTD error.134*/135COVER_dictSelection_t COVER_dictSelectionError(size_t error);136137/**138* Always call after selectDict is called to free up used memory from139* newly created dictionary.140*/141void COVER_dictSelectionFree(COVER_dictSelection_t selection);142143/**144* Called to finalize the dictionary and select one based on whether or not145* the shrink-dict flag was enabled. If enabled the dictionary used is the146* smallest dictionary within a specified regression of the compressed size147* from the largest dictionary.148*/149COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,150size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,151size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);152153154