Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.h
3156 views
/*1* Copyright (c) Meta Platforms, Inc. and affiliates.2* All rights reserved.3*4* This source code is licensed under both the BSD-style license (found in the5* LICENSE file in the root directory of this source tree) and the GPLv2 (found6* in the COPYING file in the root directory of this source tree).7* You may select, at your option, one of the above-listed licenses.8*/910#ifndef ZDICT_STATIC_LINKING_ONLY11# define ZDICT_STATIC_LINKING_ONLY12#endif1314#include <stdio.h> /* fprintf */15#include <stdlib.h> /* malloc, free, qsort */16#include <string.h> /* memset */17#include <time.h> /* clock */18#include "../common/mem.h" /* read */19#include "../common/pool.h"20#include "../common/threading.h"21#include "../common/zstd_internal.h" /* includes zstd.h */22#include "../zdict.h"2324/**25* COVER_best_t is used for two purposes:26* 1. Synchronizing threads.27* 2. Saving the best parameters and dictionary.28*29* All of the methods except COVER_best_init() are thread safe if zstd is30* compiled with multithreaded support.31*/32typedef struct COVER_best_s {33ZSTD_pthread_mutex_t mutex;34ZSTD_pthread_cond_t cond;35size_t liveJobs;36void *dict;37size_t dictSize;38ZDICT_cover_params_t parameters;39size_t compressedSize;40} COVER_best_t;4142/**43* A segment is a range in the source as well as the score of the segment.44*/45typedef struct {46U32 begin;47U32 end;48U32 score;49} COVER_segment_t;5051/**52*Number of epochs and size of each epoch.53*/54typedef struct {55U32 num;56U32 size;57} COVER_epoch_info_t;5859/**60* Struct used for the dictionary selection function.61*/62typedef struct COVER_dictSelection {63BYTE* dictContent;64size_t dictSize;65size_t totalCompressedSize;66} COVER_dictSelection_t;6768/**69* Computes the number of epochs and the size of each epoch.70* We will make sure that each epoch gets at least 10 * k bytes.71*72* The COVER algorithms divide the data up into epochs of equal size and73* select one segment from each epoch.74*75* @param maxDictSize The maximum allowed dictionary size.76* @param nbDmers The number of dmers we are training on.77* @param k The parameter k (segment size).78* @param passes The target number of passes over the dmer corpus.79* More passes means a better dictionary.80*/81COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,82U32 k, U32 passes);8384/**85* Warns the user when their corpus is too small.86*/87void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);8889/**90* Checks total compressed size of a dictionary91*/92size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,93const size_t *samplesSizes, const BYTE *samples,94size_t *offsets,95size_t nbTrainSamples, size_t nbSamples,96BYTE *const dict, size_t dictBufferCapacity);9798/**99* Returns the sum of the sample sizes.100*/101size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;102103/**104* Initialize the `COVER_best_t`.105*/106void COVER_best_init(COVER_best_t *best);107108/**109* Wait until liveJobs == 0.110*/111void COVER_best_wait(COVER_best_t *best);112113/**114* Call COVER_best_wait() and then destroy the COVER_best_t.115*/116void COVER_best_destroy(COVER_best_t *best);117118/**119* Called when a thread is about to be launched.120* Increments liveJobs.121*/122void COVER_best_start(COVER_best_t *best);123124/**125* Called when a thread finishes executing, both on error or success.126* Decrements liveJobs and signals any waiting threads if liveJobs == 0.127* If this dictionary is the best so far save it and its parameters.128*/129void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,130COVER_dictSelection_t selection);131/**132* Error function for COVER_selectDict function. Checks if the return133* value is an error.134*/135unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);136137/**138* Error function for COVER_selectDict function. Returns a struct where139* return.totalCompressedSize is a ZSTD error.140*/141COVER_dictSelection_t COVER_dictSelectionError(size_t error);142143/**144* Always call after selectDict is called to free up used memory from145* newly created dictionary.146*/147void COVER_dictSelectionFree(COVER_dictSelection_t selection);148149/**150* Called to finalize the dictionary and select one based on whether or not151* the shrink-dict flag was enabled. If enabled the dictionary used is the152* smallest dictionary within a specified regression of the compressed size153* from the largest dictionary.154*/155COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,156size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,157size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);158159160