Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Kitware
GitHub Repository: Kitware/CMake
Path: blob/master/Utilities/cmzstd/lib/dictBuilder/cover.h
3156 views
1
/*
2
* Copyright (c) Meta Platforms, Inc. and affiliates.
3
* All rights reserved.
4
*
5
* This source code is licensed under both the BSD-style license (found in the
6
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
* in the COPYING file in the root directory of this source tree).
8
* You may select, at your option, one of the above-listed licenses.
9
*/
10
11
#ifndef ZDICT_STATIC_LINKING_ONLY
12
# define ZDICT_STATIC_LINKING_ONLY
13
#endif
14
15
#include <stdio.h> /* fprintf */
16
#include <stdlib.h> /* malloc, free, qsort */
17
#include <string.h> /* memset */
18
#include <time.h> /* clock */
19
#include "../common/mem.h" /* read */
20
#include "../common/pool.h"
21
#include "../common/threading.h"
22
#include "../common/zstd_internal.h" /* includes zstd.h */
23
#include "../zdict.h"
24
25
/**
26
* COVER_best_t is used for two purposes:
27
* 1. Synchronizing threads.
28
* 2. Saving the best parameters and dictionary.
29
*
30
* All of the methods except COVER_best_init() are thread safe if zstd is
31
* compiled with multithreaded support.
32
*/
33
typedef struct COVER_best_s {
34
ZSTD_pthread_mutex_t mutex;
35
ZSTD_pthread_cond_t cond;
36
size_t liveJobs;
37
void *dict;
38
size_t dictSize;
39
ZDICT_cover_params_t parameters;
40
size_t compressedSize;
41
} COVER_best_t;
42
43
/**
44
* A segment is a range in the source as well as the score of the segment.
45
*/
46
typedef struct {
47
U32 begin;
48
U32 end;
49
U32 score;
50
} COVER_segment_t;
51
52
/**
53
*Number of epochs and size of each epoch.
54
*/
55
typedef struct {
56
U32 num;
57
U32 size;
58
} COVER_epoch_info_t;
59
60
/**
61
* Struct used for the dictionary selection function.
62
*/
63
typedef struct COVER_dictSelection {
64
BYTE* dictContent;
65
size_t dictSize;
66
size_t totalCompressedSize;
67
} COVER_dictSelection_t;
68
69
/**
70
* Computes the number of epochs and the size of each epoch.
71
* We will make sure that each epoch gets at least 10 * k bytes.
72
*
73
* The COVER algorithms divide the data up into epochs of equal size and
74
* select one segment from each epoch.
75
*
76
* @param maxDictSize The maximum allowed dictionary size.
77
* @param nbDmers The number of dmers we are training on.
78
* @param k The parameter k (segment size).
79
* @param passes The target number of passes over the dmer corpus.
80
* More passes means a better dictionary.
81
*/
82
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
83
U32 k, U32 passes);
84
85
/**
86
* Warns the user when their corpus is too small.
87
*/
88
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
89
90
/**
91
* Checks total compressed size of a dictionary
92
*/
93
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
94
const size_t *samplesSizes, const BYTE *samples,
95
size_t *offsets,
96
size_t nbTrainSamples, size_t nbSamples,
97
BYTE *const dict, size_t dictBufferCapacity);
98
99
/**
100
* Returns the sum of the sample sizes.
101
*/
102
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
103
104
/**
105
* Initialize the `COVER_best_t`.
106
*/
107
void COVER_best_init(COVER_best_t *best);
108
109
/**
110
* Wait until liveJobs == 0.
111
*/
112
void COVER_best_wait(COVER_best_t *best);
113
114
/**
115
* Call COVER_best_wait() and then destroy the COVER_best_t.
116
*/
117
void COVER_best_destroy(COVER_best_t *best);
118
119
/**
120
* Called when a thread is about to be launched.
121
* Increments liveJobs.
122
*/
123
void COVER_best_start(COVER_best_t *best);
124
125
/**
126
* Called when a thread finishes executing, both on error or success.
127
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
128
* If this dictionary is the best so far save it and its parameters.
129
*/
130
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
131
COVER_dictSelection_t selection);
132
/**
133
* Error function for COVER_selectDict function. Checks if the return
134
* value is an error.
135
*/
136
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
137
138
/**
139
* Error function for COVER_selectDict function. Returns a struct where
140
* return.totalCompressedSize is a ZSTD error.
141
*/
142
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
143
144
/**
145
* Always call after selectDict is called to free up used memory from
146
* newly created dictionary.
147
*/
148
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
149
150
/**
151
* Called to finalize the dictionary and select one based on whether or not
152
* the shrink-dict flag was enabled. If enabled the dictionary used is the
153
* smallest dictionary within a specified regression of the compressed size
154
* from the largest dictionary.
155
*/
156
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
157
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
158
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
159
160