Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.h
5020 views
1
/*
2
* Copyright (c) Meta Platforms, Inc. and affiliates.
3
* All rights reserved.
4
*
5
* This source code is licensed under both the BSD-style license (found in the
6
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
* in the COPYING file in the root directory of this source tree).
8
* You may select, at your option, one of the above-listed licenses.
9
*/
10
11
/* Must be defined before zdict.h is included.
 * NOTE(review): presumably this is what makes zdict.h declare
 * ZDICT_cover_params_t, which this header uses -- confirm against zdict.h. */
#ifndef ZDICT_STATIC_LINKING_ONLY
#  define ZDICT_STATIC_LINKING_ONLY
#endif
14
15
#include "../common/threading.h" /* ZSTD_pthread_mutex_t */
16
#include "../common/mem.h" /* U32, BYTE */
17
#include "../zdict.h"
18
19
/**
20
* COVER_best_t is used for two purposes:
21
* 1. Synchronizing threads.
22
* 2. Saving the best parameters and dictionary.
23
*
24
* All of the methods except COVER_best_init() are thread safe if zstd is
25
* compiled with multithreaded support.
26
*/
27
typedef struct COVER_best_s {
28
ZSTD_pthread_mutex_t mutex;
29
ZSTD_pthread_cond_t cond;
30
size_t liveJobs;
31
void *dict;
32
size_t dictSize;
33
ZDICT_cover_params_t parameters;
34
size_t compressedSize;
35
} COVER_best_t;
36
37
/**
38
* A segment is a range in the source as well as the score of the segment.
39
*/
40
typedef struct {
41
U32 begin;
42
U32 end;
43
U32 score;
44
} COVER_segment_t;
45
46
/**
47
*Number of epochs and size of each epoch.
48
*/
49
typedef struct {
50
U32 num;
51
U32 size;
52
} COVER_epoch_info_t;
53
54
/**
55
* Struct used for the dictionary selection function.
56
*/
57
typedef struct COVER_dictSelection {
58
BYTE* dictContent;
59
size_t dictSize;
60
size_t totalCompressedSize;
61
} COVER_dictSelection_t;
62
63
/**
64
* Computes the number of epochs and the size of each epoch.
65
* We will make sure that each epoch gets at least 10 * k bytes.
66
*
67
* The COVER algorithms divide the data up into epochs of equal size and
68
* select one segment from each epoch.
69
*
70
* @param maxDictSize The maximum allowed dictionary size.
71
* @param nbDmers The number of dmers we are training on.
72
* @param k The parameter k (segment size).
73
* @param passes The target number of passes over the dmer corpus.
74
* More passes means a better dictionary.
75
*/
76
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
77
U32 k, U32 passes);
78
79
/**
 * Warns the user when their corpus is too small.
 *
 * @param maxDictSize  The maximum allowed dictionary size.
 * @param nbDmers      The number of dmers in the corpus.
 * @param displayLevel NOTE(review): presumably the verbosity threshold for
 *                     emitting the warning -- confirm in cover.c.
 */
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
83
84
/**
85
* Checks total compressed size of a dictionary
86
*/
87
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
88
const size_t *samplesSizes, const BYTE *samples,
89
size_t *offsets,
90
size_t nbTrainSamples, size_t nbSamples,
91
BYTE *const dict, size_t dictBufferCapacity);
92
93
/**
 * Returns the sum of the sample sizes.
 *
 * @param samplesSizes Array of sample sizes.
 * @param nbSamples    Number of entries of `samplesSizes` to sum.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples);
97
98
/**
99
* Initialize the `COVER_best_t`.
100
*/
101
void COVER_best_init(COVER_best_t *best);
102
103
/**
104
* Wait until liveJobs == 0.
105
*/
106
void COVER_best_wait(COVER_best_t *best);
107
108
/**
109
* Call COVER_best_wait() and then destroy the COVER_best_t.
110
*/
111
void COVER_best_destroy(COVER_best_t *best);
112
113
/**
114
* Called when a thread is about to be launched.
115
* Increments liveJobs.
116
*/
117
void COVER_best_start(COVER_best_t *best);
118
119
/**
120
* Called when a thread finishes executing, both on error or success.
121
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
122
* If this dictionary is the best so far save it and its parameters.
123
*/
124
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
125
COVER_dictSelection_t selection);
126
/**
127
* Error function for COVER_selectDict function. Checks if the return
128
* value is an error.
129
*/
130
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
131
132
/**
133
* Error function for COVER_selectDict function. Returns a struct where
134
* return.totalCompressedSize is a ZSTD error.
135
*/
136
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
137
138
/**
139
* Always call after selectDict is called to free up used memory from
140
* newly created dictionary.
141
*/
142
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
143
144
/**
145
* Called to finalize the dictionary and select one based on whether or not
146
* the shrink-dict flag was enabled. If enabled the dictionary used is the
147
* smallest dictionary within a specified regression of the compressed size
148
* from the largest dictionary.
149
*/
150
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
151
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
152
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
153
154