Path: blob/main/sys/contrib/zstd/programs/dibio.c
/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */


/* **************************************
*  Compiler Warnings
****************************************/
#ifdef _MSC_VER
#  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
#endif


/*-*************************************
*  Includes
***************************************/
#include "platform.h"           /* Large Files support */
#include "util.h"               /* UTIL_getFileSize, UTIL_getTotalFileSize */
#include <stdlib.h>             /* malloc, free */
#include <string.h>             /* memset */
#include <stdio.h>              /* fprintf, fopen, ftello64 */
#include <errno.h>              /* errno */
#include <assert.h>

#include "timefn.h"             /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
#include "../lib/common/mem.h"  /* read */
#include "dibio.h"


/*-*************************************
*  Constants
***************************************/
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11              /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9         /* rough estimation : memory cost to analyze 1 byte of sample */
#define FASTCOVER_MEMMULT 1     /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

#define NOISELENGTH 32
#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */


/*-*************************************
*  Console display
***************************************/
#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }

static const U64 g_refreshRate = SEC_TO_MICRO / 6;
static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;

#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
            if (displayLevel>=4) fflush(stderr); } } }

/*-*************************************
*  Exceptions
***************************************/
#ifndef DEBUG
#  define DEBUG 0
#endif
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}


/* ********************************************************
*  Helper functions
**********************************************************/
#undef MIN
#define MIN(a,b)    ((a) < (b) ? (a) : (b))

/**
  Returns the size of a file.
  If an error occurs, returns -1.
*/
static S64 DiB_getFileSize (const char * fileName)
{
    U64 const fileSize = UTIL_getFileSize(fileName);
    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
}
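
/* Illustration (a minimal sketch, not part of the build) : the signed return
 * value folds "unknown size" and "empty file" into a single check, which is
 * how the loader below skips unusable inputs. `names` and `i` are
 * hypothetical names :
 *
 *    S64 const sz = DiB_getFileSize(names[i]);
 *    if (sz <= 0) { i++; continue; }    // skip empty or unreadable files
 */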

/* ********************************************************
*  File related operations
**********************************************************/
/** DiB_loadFiles() :
 *  load samples from files listed in fileNamesTable into buffer.
 *  works even if buffer is too small to load all samples.
 *  Also provides the size of each sample into sampleSizes table
 *  which must be sized correctly, using DiB_fileStats().
 * @return : nb of samples effectively loaded into `buffer`
 * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
static int DiB_loadFiles(
    void* buffer, size_t* bufferSizePtr,
    size_t* sampleSizes, int sstSize,
    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel)
{
    char* const buff = (char*)buffer;
    size_t totalDataLoaded = 0;
    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE* f = NULL;

    assert(targetChunkSize <= SAMPLESIZE_MAX);

    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
        size_t fileDataLoaded;
        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
        if (fileSize <= 0) {
            /* skip if zero-size or file error;
             * advance fileIndex, otherwise the loop never terminates */
            fileIndex += 1;
            continue;
        }

        f = fopen( fileNamesTable[fileIndex], "rb");
        if (f == NULL)
            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
        DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

        /* Load the first chunk of data from the file */
        fileDataLoaded = targetChunkSize > 0 ?
                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
                            (size_t)MIN(fileSize, SAMPLESIZE_MAX);
        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
            break;
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;

        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;

                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
        }
        fileIndex += 1;
        fclose(f); f = NULL;
    }
    if (f != NULL)
        fclose(f);

    DISPLAYLEVEL(2, "\r%79s\r", "");
    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
                 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded);
    *bufferSizePtr = totalDataLoaded;
    return nbSamplesLoaded;
}

#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
    static const U32 prime1 = 2654435761U;
    static const U32 prime2 = 2246822519U;
    U32 rand32 = *src;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32  = DiB_rotl32(rand32, 13);
    *src = rand32;
    return rand32 >> 5;
}
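
/* Note : DiB_rand() is a small deterministic mixer built from the two xxHash32
 * primes above; it is not cryptographic. A minimal usage sketch (variable
 * names are illustrative) :
 *
 *    U32 seed = 0xFD2FB528;    // any fixed seed works; DiB_shuffle() below
 *                              // uses this value (the zstd frame magic number)
 *    unsigned const j = DiB_rand(&seed) % (i+1);    // pseudo-random index in [0, i]
 */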

/* DiB_shuffle() :
 * shuffle a table of file names in a semi-random way.
 * It improves dictionary quality by reducing the impact of "locality" :
 * when the sample set is larger than what fits in memory, random elements
 * from across the whole set get loaded, instead of just the first ones. */
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
    U32 seed = 0xFD2FB528;
    unsigned i;
    assert(nbFiles >= 1);
    for (i = nbFiles - 1; i > 0; --i) {
        unsigned const j = DiB_rand(&seed) % (i + 1);
        const char* const tmp = fileNamesTable[j];
        fileNamesTable[j] = fileNamesTable[i];
        fileNamesTable[i] = tmp;
    }
}


/*-********************************************************
*  Dictionary training functions
**********************************************************/
/* DiB_findMaxMem() :
 * probe for the largest allocatable block, starting slightly above requiredMem
 * (capped at g_maxMemory) and decreasing in 8 MB steps until a malloc() succeeds.
 * Returns a size one step below the amount that could actually be allocated. */
static size_t DiB_findMaxMem(unsigned long long requiredMem)
{
    size_t const step = 8 MB;
    void* testmem = NULL;

    requiredMem = (((requiredMem >> 23) + 1) << 23);
    requiredMem += step;
    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;

    while (!testmem) {
        testmem = malloc((size_t)requiredMem);
        requiredMem -= step;
    }

    free(testmem);
    return (size_t)requiredMem;
}


/* DiB_fillNoise() :
 * fill `buffer` with pseudorandom bytes;
 * used to pad a guard band past the end of the loaded samples. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned acc = prime1;
    size_t p = 0;

    for (p=0; p<length; p++) {
        acc *= prime2;
        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
    }
}


static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const f = fopen(dictFileName, "wb");
    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    { size_t const n = fwrite(buff, 1, buffSize, f);
      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }

    { size_t const n = (size_t)fclose(f);
      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
}
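
/* Allocation note (a sketch, assuming `fs` was filled by DiB_fileStats() below) :
 * the collected stats drive the two buffers allocated in DiB_trainFromFiles() :
 *
 *    void*   srcBuffer   = malloc(loadedSize + NOISELENGTH);                // samples + guard band
 *    size_t* sampleSizes = malloc((size_t)fs.nbSamples * sizeof(size_t));   // one entry per sample
 *
 * where loadedSize derives from fs.totalSizeToLoad, clamped by available
 * memory and MAX_SAMPLES_SIZE. */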

typedef struct {
    S64 totalSizeToLoad;
    int nbSamples;
    int oneSampleTooLarge;
} fileStats;

/*! DiB_fileStats() :
 *  Given a list of files, and a chunkSize (0 == no chunk, whole files),
 *  provides the amount of data to be loaded and the resulting nb of samples.
 *  This is useful primarily for allocation purposes => sample buffer, and sample sizes table.
 */
static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
{
    fileStats fs;
    int n;
    memset(&fs, 0, sizeof(fs));

    /* We assume that if chunking is requested, the chunk size is <= SAMPLESIZE_MAX */
    assert( chunkSize <= SAMPLESIZE_MAX );

    for (n=0; n<nbFiles; n++) {
        S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
        /* TODO: is there a minimum sample size? What if the file is 1-byte? */
        if (fileSize == 0) {
            DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
            continue;
        }

        /* the case where we are breaking up files into sample chunks */
        if (chunkSize > 0) {
            /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
            fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
            fs.totalSizeToLoad += fileSize;
        }
        else {
            /* the case where one file is one sample */
            if (fileSize > SAMPLESIZE_MAX) {
                /* flag excessively large sample files */
                fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);

                /* Limit to the first SAMPLESIZE_MAX (128 KB) of the file */
                DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB\n",
                             fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
            }
            fs.nbSamples += 1;
            fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
        }
    }
    DISPLAYLEVEL(4, "Found training data : %d files, %d KB, %d samples\n",
                 nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
    return fs;
}

int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes;     /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded;     /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize;       /* total data loaded in srcBuffer for all samples */
    void* srcBuffer;         /* contiguous buffer with training data/samples */
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    int const displayLevel = params ? params->zParams.notificationLevel :
                             coverParams ? coverParams->zParams.notificationLevel :
                             fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load, and with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT :
                            FASTCOVER_MEMMULT;
        size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory, and to 2 GB max */
        /* TODO: there is an opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                         (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }
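
    /* Sizing rationale (illustrative numbers) : memMult is a rough estimate of
     * trainer working memory per byte of sample, so probing for
     * totalSizeToLoad * memMult bytes and dividing the result back by memMult
     * yields the largest sample load the trainer can afford. With MEMMULT == 11,
     * loading 1 GB of samples is only attempted when about 11 GB is allocatable. */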

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
                     (unsigned)(fs.totalSizeToLoad / (1 MB)),
                     (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               coverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                                 coverParams->steps, splitPercentage);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                       sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                                   srcBuffer, sampleSizes, nbSamplesLoaded,
                                                                   fastCoverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                                 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                                 fastCoverParams->accel);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                           sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
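
/* Usage sketch (illustrative only; values are assumptions, not recommended
 * defaults) : training a 64 KB dictionary with the fastCover trainer and
 * parameter optimization, similar to what `zstd --train-fastcover` requests :
 *
 *    ZDICT_fastCover_params_t fcParams;
 *    memset(&fcParams, 0, sizeof(fcParams));    // zeroed k/d/f/steps => let the optimizer search
 *    fcParams.zParams.notificationLevel = 2;
 *    DiB_trainFromFiles("out.dict", 64 KB,
 *                       fileNamesTable, nbFiles,
 *                       0,             // chunkSize : 0 == one file per sample
 *                       NULL, NULL,    // no legacy / cover parameters
 *                       &fcParams,
 *                       1,             // optimize : search parameter space
 *                       0);            // memLimit : no manual cap
 */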