GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/zstd/programs/dibio.c
/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */


/* **************************************
* Compiler Warnings
****************************************/
#ifdef _MSC_VER
# pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
#endif


/*-*************************************
* Includes
***************************************/
#include "platform.h"           /* Large Files support */
#include "util.h"               /* UTIL_getFileSize, UTIL_getTotalFileSize */
#include <stdlib.h>             /* malloc, free */
#include <string.h>             /* memset */
#include <stdio.h>              /* fprintf, fopen, ftello64 */
#include <errno.h>              /* errno */
#include <assert.h>

#include "timefn.h"             /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
#include "../lib/common/mem.h"  /* read */
#include "dibio.h"


/*-*************************************
* Constants
***************************************/
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)
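/* The size macros above are postfix multipliers: writing `128 KB` expands to
 * `128 *(1 <<10)`, i.e. 131072, so expressions such as (128 KB) or (2 GB)
 * read naturally while remaining plain integer arithmetic. */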

#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11              /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9         /* rough estimation : memory cost to analyze 1 byte of sample */
#define FASTCOVER_MEMMULT 1     /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
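/* On 32-bit builds the cap above evaluates to 2 GB - 64 MB; on a typical
 * 64-bit build (sizeof(size_t) == 8) it is 512 MB << 8, i.e. 128 GB. */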

#define NOISELENGTH 32
#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */


/*-*************************************
* Console display
***************************************/
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }

static const U64 g_refreshRate = SEC_TO_MICRO / 6;
static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;

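/* DISPLAYUPDATE() throttles progress messages: it refreshes the console only
 * once the time elapsed since the previous update exceeds g_refreshRate
 * (roughly 6 refreshes per second), except at display level 4 and above,
 * where every message is printed and flushed immediately. */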
#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
            if (displayLevel>=4) fflush(stderr); } } }

/*-*************************************
* Exceptions
***************************************/
#ifndef DEBUG
# define DEBUG 0
#endif
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}


/* ********************************************************
* Helper functions
**********************************************************/
#undef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/**
  Returns the size of a file.
  If error returns -1.
*/
static S64 DiB_getFileSize (const char * fileName)
{
    U64 const fileSize = UTIL_getFileSize(fileName);
    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
}


/* ********************************************************
* File related operations
**********************************************************/
/** DiB_loadFiles() :
 *  load samples from files listed in fileNamesTable into buffer.
 *  works even if buffer is too small to load all samples.
 *  Also provides the size of each sample into sampleSizes table
 *  which must be sized correctly, using DiB_fileStats().
 * @return : nb of samples effectively loaded into `buffer`
 *  *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
static int DiB_loadFiles(
    void* buffer, size_t* bufferSizePtr,
    size_t* sampleSizes, int sstSize,
    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel )
{
    char* const buff = (char*)buffer;
    size_t totalDataLoaded = 0;
    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE * f = NULL;

    assert(targetChunkSize <= SAMPLESIZE_MAX);

    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
        size_t fileDataLoaded;
        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
        if (fileSize <= 0) {
            /* skip if zero-size or file error; advance to the next file
             * so the loop cannot spin on the same entry */
            fileIndex++;
            continue;
        }

        f = fopen( fileNamesTable[fileIndex], "rb");
        if (f == NULL)
            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
        DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

        /* Load the first chunk of data from the file */
        fileDataLoaded = targetChunkSize > 0 ?
                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
            break;
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;

        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;

                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
        }
        fileIndex += 1;
        fclose(f); f = NULL;
    }
    if (f != NULL)
        fclose(f);

    DISPLAYLEVEL(2, "\r%79s\r", "");
    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
    *bufferSizePtr = totalDataLoaded;
    return nbSamplesLoaded;
}

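/* DiB_rotl32() / DiB_rand() :
 * minimal multiplicative PRNG used only to shuffle the file list.
 * The constants are the 32-bit xxHash primes; combined with the fixed seed
 * used by DiB_shuffle(), the sequence (and thus the shuffle order) is
 * identical from one run to the next. */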
#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
    static const U32 prime1 = 2654435761U;
    static const U32 prime2 = 2246822519U;
    U32 rand32 = *src;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32 = DiB_rotl32(rand32, 13);
    *src = rand32;
    return rand32 >> 5;
}

/* DiB_shuffle() :
 * shuffles a table of file names in a semi-random (but deterministic) way.
 * It improves dictionary quality by reducing the impact of "locality" : when the
 * sample set is too large to be loaded entirely, training picks random elements
 * from it instead of just the first ones. */
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
    U32 seed = 0xFD2FB528;
    unsigned i;
    assert(nbFiles >= 1);
    for (i = nbFiles - 1; i > 0; --i) {
        unsigned const j = DiB_rand(&seed) % (i + 1);
        const char* const tmp = fileNamesTable[j];
        fileNamesTable[j] = fileNamesTable[i];
        fileNamesTable[i] = tmp;
    }
}


/*-********************************************************
* Dictionary training functions
**********************************************************/
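/* DiB_findMaxMem() :
 * probes how much memory can actually be allocated for the sample buffer.
 * requiredMem is rounded up to the next 8 MB multiple (capped at g_maxMemory),
 * then malloc() is attempted with progressively smaller sizes, in 8 MB steps,
 * until one succeeds; the probe allocation is freed and the size one step
 * below the successful allocation is returned. */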
static size_t DiB_findMaxMem(unsigned long long requiredMem)
{
    size_t const step = 8 MB;
    void* testmem = NULL;

    requiredMem = (((requiredMem >> 23) + 1) << 23);
    requiredMem += step;
    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;

    while (!testmem) {
        testmem = malloc((size_t)requiredMem);
        requiredMem -= step;
    }

    free(testmem);
    return (size_t)requiredMem;
}


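/* DiB_fillNoise() :
 * fills `buffer` with `length` bytes of deterministic pseudo-random noise.
 * Used to pad the NOISELENGTH guard band appended after the loaded samples
 * (see the legacy-trainer path in DiB_trainFromFiles()). */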
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned acc = prime1;
    size_t p=0;

    for (p=0; p<length; p++) {
        acc *= prime2;
        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
    }
}


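/* DiB_saveDict() :
 * writes the trained dictionary to `dictFileName`;
 * exits the process (via EXM_THROW) on open, write or close failure. */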
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const f = fopen(dictFileName, "wb");
    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    { size_t const n = fwrite(buff, 1, buffSize, f);
      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }

    { size_t const n = (size_t)fclose(f);
      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
}

typedef struct {
    S64 totalSizeToLoad;    /* total size of all samples to load (before any memory limit is applied) */
    int nbSamples;          /* number of samples the buffer and sizes table must accommodate */
    int oneSampleTooLarge;  /* set when at least one sample exceeds 2*SAMPLESIZE_MAX */
} fileStats;

/*! DiB_fileStats() :
 *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
 *  provides the amount of data to be loaded and the resulting nb of samples.
 *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
 */
static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
{
    fileStats fs;
    int n;
    memset(&fs, 0, sizeof(fs));

    // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
    assert( chunkSize <= SAMPLESIZE_MAX );

    for (n=0; n<nbFiles; n++) {
        S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
        // TODO: is there a minimum sample size? What if the file is 1-byte?
        if (fileSize == 0) {
            DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
            continue;
        }

        /* the case where we are breaking up files in sample chunks */
        if (chunkSize > 0)
        {
            // TODO: is there a minimum sample size? Can we have a 1-byte sample?
            fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
            fs.totalSizeToLoad += fileSize;
        }
        else {
            /* the case where one file is one sample */
            if (fileSize > SAMPLESIZE_MAX) {
                /* flag excessively large sample files */
                fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);

                /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
                DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
                    fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
            }
            fs.nbSamples += 1;
            fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
        }
    }
    DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
    return fs;
}

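/* DiB_trainFromFiles() :
 * top-level training driver (declared in dibio.h).
 * Shuffles the file list, estimates how much sample data can be loaded,
 * loads the samples into a single contiguous buffer, then dispatches to the
 * legacy, cover or fastCover trainer depending on which parameter pointer is
 * non-NULL (checked in that order), and finally writes the dictionary to
 * dictFileName. Returns 0 on success, 1 if training fails; fatal errors
 * terminate the process via EXM_THROW. */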
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes;   /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded;   /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize;     /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    int const displayLevel = params ? params->zParams.notificationLevel :
                        coverParams ? coverParams->zParams.notificationLevel :
                        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               coverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                                coverParams->steps, splitPercentage);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                       sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                                   srcBuffer, sampleSizes, nbSamplesLoaded,
                                                                   fastCoverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                                fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                                fastCoverParams->accel);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                           sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
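
/* Illustrative usage (hypothetical values; fileTable and nbFileTable are
 * placeholders, not defined in this file). One of the three parameter
 * pointers selects the training algorithm (legacy, cover or fastCover,
 * checked in that order); here the fastCover path with optimization:
 *
 *     ZDICT_fastCover_params_t fcParams;
 *     memset(&fcParams, 0, sizeof(fcParams));
 *     fcParams.zParams.notificationLevel = 2;
 *     DiB_trainFromFiles("out.dict", 110 KB, fileTable, nbFileTable,
 *                        0, NULL, NULL, &fcParams, 1, 0);
 *     // chunkSize=0 (one file = one sample), optimize=1,
 *     // memLimit=0 (no manual limit on training data)
 */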