Path: blob/main/sys/contrib/zstd/programs/dibio.c
/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */


/* **************************************
*  Compiler Warnings
****************************************/
#ifdef _MSC_VER
#  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
#endif


/*-*************************************
*  Includes
***************************************/
#include "platform.h"           /* Large Files support */
#include "util.h"               /* UTIL_getFileSize, UTIL_getTotalFileSize */
#include <stdlib.h>             /* malloc, free */
#include <string.h>             /* memset */
#include <stdio.h>              /* fprintf, fopen, ftello64 */
#include <errno.h>              /* errno */
#include <assert.h>

#include "timefn.h"             /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
#include "../lib/common/mem.h"  /* read */
#include "dibio.h"


/*-*************************************
*  Constants
***************************************/
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

#define SAMPLESIZE_MAX (128 KB)
#define MEMMULT 11              /* rough estimation : memory cost to analyze 1 byte of sample */
#define COVER_MEMMULT 9         /* rough estimation : memory cost to analyze 1 byte of sample */
#define FASTCOVER_MEMMULT 1     /* rough estimation : memory cost to analyze 1 byte of sample */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

#define NOISELENGTH 32
#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */


/*-*************************************
*  Console display
***************************************/
#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }

static const U64 g_refreshRate = SEC_TO_MICRO / 6;
static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;

#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
            if (displayLevel>=4) fflush(stderr); } } }

/*-*************************************
*  Exceptions
***************************************/
#ifndef DEBUG
#  define DEBUG 0
#endif
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}


/* ********************************************************
*  Helper functions
**********************************************************/
#undef MIN
#define MIN(a,b)    ((a) < (b) ? (a) : (b))

/**
  Returns the size of a file.
  If an error occurs, returns -1.
*/
static S64 DiB_getFileSize (const char * fileName)
{
    U64 const fileSize = UTIL_getFileSize(fileName);
    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
}
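
/* Illustration (a minimal sketch, not part of the build) : the signed return
 * value folds "unknown size" and "empty file" into a single check, which is
 * how the loader below skips unusable inputs. `names` and `i` are
 * hypothetical names :
 *
 *    S64 const sz = DiB_getFileSize(names[i]);
 *    if (sz <= 0) { i++; continue; }    // skip empty or unreadable files
 */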

/* ********************************************************
*  File related operations
**********************************************************/
/** DiB_loadFiles() :
 *  load samples from files listed in fileNamesTable into buffer.
 *  works even if buffer is too small to load all samples.
 *  Also provides the size of each sample into sampleSizes table
 *  which must be sized correctly, using DiB_fileStats().
 * @return : nb of samples effectively loaded into `buffer`
 * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
static int DiB_loadFiles(
    void* buffer, size_t* bufferSizePtr,
    size_t* sampleSizes, int sstSize,
    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel)
{
    char* const buff = (char*)buffer;
    size_t totalDataLoaded = 0;
    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE* f = NULL;

    assert(targetChunkSize <= SAMPLESIZE_MAX);

    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
        size_t fileDataLoaded;
        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
        if (fileSize <= 0) {
            /* skip if zero-size or file error;
             * advance fileIndex, otherwise the loop never terminates */
            fileIndex += 1;
            continue;
        }

        f = fopen( fileNamesTable[fileIndex], "rb");
        if (f == NULL)
            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
        DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

        /* Load the first chunk of data from the file */
        fileDataLoaded = targetChunkSize > 0 ?
                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
                            (size_t)MIN(fileSize, SAMPLESIZE_MAX);
        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
            break;
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;

        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;

                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
        }
        fileIndex += 1;
        fclose(f); f = NULL;
    }
    if (f != NULL)
        fclose(f);

    DISPLAYLEVEL(2, "\r%79s\r", "");
    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
                 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded);
    *bufferSizePtr = totalDataLoaded;
    return nbSamplesLoaded;
}

#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
static U32 DiB_rand(U32* src)
{
    static const U32 prime1 = 2654435761U;
    static const U32 prime2 = 2246822519U;
    U32 rand32 = *src;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32  = DiB_rotl32(rand32, 13);
    *src = rand32;
    return rand32 >> 5;
}
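
/* Note : DiB_rand() is a small deterministic mixer built from the two xxHash32
 * primes above; it is not cryptographic. A minimal usage sketch (variable
 * names are illustrative) :
 *
 *    U32 seed = 0xFD2FB528;    // any fixed seed works; DiB_shuffle() below
 *                              // uses this value (the zstd frame magic number)
 *    unsigned const j = DiB_rand(&seed) % (i+1);    // pseudo-random index in [0, i]
 */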

/* DiB_shuffle() :
 * shuffle a table of file names in a semi-random way.
 * It improves dictionary quality by reducing the impact of "locality" :
 * when the sample set is larger than what fits in memory, random elements
 * from across the whole set get loaded, instead of just the first ones. */
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
    U32 seed = 0xFD2FB528;
    unsigned i;
    assert(nbFiles >= 1);
    for (i = nbFiles - 1; i > 0; --i) {
        unsigned const j = DiB_rand(&seed) % (i + 1);
        const char* const tmp = fileNamesTable[j];
        fileNamesTable[j] = fileNamesTable[i];
        fileNamesTable[i] = tmp;
    }
}


/*-********************************************************
*  Dictionary training functions
**********************************************************/
/* DiB_findMaxMem() :
 * probe for the largest allocatable block, starting slightly above requiredMem
 * (capped at g_maxMemory) and decreasing in 8 MB steps until a malloc() succeeds.
 * Returns a size one step below the amount that could actually be allocated. */
static size_t DiB_findMaxMem(unsigned long long requiredMem)
{
    size_t const step = 8 MB;
    void* testmem = NULL;

    requiredMem = (((requiredMem >> 23) + 1) << 23);
    requiredMem += step;
    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;

    while (!testmem) {
        testmem = malloc((size_t)requiredMem);
        requiredMem -= step;
    }

    free(testmem);
    return (size_t)requiredMem;
}


/* DiB_fillNoise() :
 * fill `buffer` with pseudorandom bytes;
 * used to pad a guard band past the end of the loaded samples. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned acc = prime1;
    size_t p = 0;

    for (p=0; p<length; p++) {
        acc *= prime2;
        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
    }
}


static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const f = fopen(dictFileName, "wb");
    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    { size_t const n = fwrite(buff, 1, buffSize, f);
      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }

    { size_t const n = (size_t)fclose(f);
      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
}
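
/* Allocation note (a sketch, assuming `fs` was filled by DiB_fileStats() below) :
 * the collected stats drive the two buffers allocated in DiB_trainFromFiles() :
 *
 *    void*   srcBuffer   = malloc(loadedSize + NOISELENGTH);                // samples + guard band
 *    size_t* sampleSizes = malloc((size_t)fs.nbSamples * sizeof(size_t));   // one entry per sample
 *
 * where loadedSize derives from fs.totalSizeToLoad, clamped by available
 * memory and MAX_SAMPLES_SIZE. */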

typedef struct {
    S64 totalSizeToLoad;
    int nbSamples;
    int oneSampleTooLarge;
} fileStats;

/*! DiB_fileStats() :
 *  Given a list of files, and a chunkSize (0 == no chunk, whole files),
 *  provides the amount of data to be loaded and the resulting nb of samples.
 *  This is useful primarily for allocation purposes => sample buffer, and sample sizes table.
 */
static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
{
    fileStats fs;
    int n;
    memset(&fs, 0, sizeof(fs));

    /* We assume that if chunking is requested, the chunk size is <= SAMPLESIZE_MAX */
    assert( chunkSize <= SAMPLESIZE_MAX );

    for (n=0; n<nbFiles; n++) {
        S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
        /* TODO: is there a minimum sample size? What if the file is 1-byte? */
        if (fileSize == 0) {
            DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
            continue;
        }

        /* the case where we are breaking up files into sample chunks */
        if (chunkSize > 0) {
            /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
            fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
            fs.totalSizeToLoad += fileSize;
        }
        else {
            /* the case where one file is one sample */
            if (fileSize > SAMPLESIZE_MAX) {
                /* flag excessively large sample files */
                fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);

                /* Limit to the first SAMPLESIZE_MAX (128 KB) of the file */
                DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB\n",
                             fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
            }
            fs.nbSamples += 1;
            fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
        }
    }
    DISPLAYLEVEL(4, "Found training data : %d files, %d KB, %d samples\n",
                 nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
    return fs;
}

int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes;     /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded;     /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize;       /* total data loaded in srcBuffer for all samples */
    void* srcBuffer;         /* contiguous buffer with training data/samples */
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    int const displayLevel = params ? params->zParams.notificationLevel :
                             coverParams ? coverParams->zParams.notificationLevel :
                             fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load, and with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT :
                            FASTCOVER_MEMMULT;
        size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory, and to 2 GB max */
        /* TODO: there is an opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                         (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }
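
    /* Sizing rationale (illustrative numbers) : memMult is a rough estimate of
     * trainer working memory per byte of sample, so probing for
     * totalSizeToLoad * memMult bytes and dividing the result back by memMult
     * yields the largest sample load the trainer can afford. With MEMMULT == 11,
     * loading 1 GB of samples is only attempted when about 11 GB is allocatable. */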

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
                     (unsigned)(fs.totalSizeToLoad / (1 MB)),
                     (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               coverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                                 coverParams->steps, splitPercentage);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                       sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                                   srcBuffer, sampleSizes, nbSamplesLoaded,
                                                                   fastCoverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                                 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                                 fastCoverParams->accel);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                           sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
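
/* Usage sketch (illustrative only; values are assumptions, not recommended
 * defaults) : training a 64 KB dictionary with the fastCover trainer and
 * parameter optimization, similar to what `zstd --train-fastcover` requests :
 *
 *    ZDICT_fastCover_params_t fcParams;
 *    memset(&fcParams, 0, sizeof(fcParams));    // zeroed k/d/f/steps => let the optimizer search
 *    fcParams.zParams.notificationLevel = 2;
 *    DiB_trainFromFiles("out.dict", 64 KB,
 *                       fileNamesTable, nbFiles,
 *                       0,             // chunkSize : 0 == one file per sample
 *                       NULL, NULL,    // no legacy / cover parameters
 *                       &fcParams,
 *                       1,             // optimize : search parameter space
 *                       0);            // memLimit : no manual cap
 */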