CoCalc -- lorem.c

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/zstd/programs/lorem.c
289023 views
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 * All rights reserved.
4
 *
5
 * This source code is licensed under both the BSD-style license (found in the
6
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
 * in the COPYING file in the root directory of this source tree).
8
 * You may select, at your option, one of the above-listed licenses.
9
 */
10

11
/* Implementation notes:
12
 *
13
 * This is a very simple lorem ipsum generator
14
 * which features a static list of words
15
 * and prints them one after another randomly
16
 * with a fake sentence / paragraph structure.
17
 *
18
 * The goal is to generate a printable text
19
 * that can be used to fake a text compression scenario.
20
 * The resulting compression / ratio curve of the lorem ipsum generator
21
 * is more satisfying than the previous statistical generator,
22
 * which was initially designed for entropy compression,
23
 * and lacks a regularity more representative of text.
24
 *
25
 * The compression ratio achievable on the generated lorem ipsum
26
 * is still a bit too good, presumably because the dictionary is a bit too
27
 * small. It would be possible to create some more complex scheme, notably by
28
 * enlarging the dictionary with a word generator, and adding grammatical rules
29
 * (composition) and syntax rules. But that's probably overkill for the intended
30
 * goal.
31
 */
32

33
#include "lorem.h"
34
#include <assert.h>
35
#include <limits.h> /* INT_MAX */
36
#include <string.h> /* memcpy */
37

38
#define WORD_MAX_SIZE 20
39

40
/* Word pool: a fixed vocabulary of lowercase words.
 * The first 19 entries, read in order, form the fixed opening sentence
 * ("lorem ipsum dolor sit amet ... magna aliqua") that
 * generateFirstSentence() emits by walking kWords[0..18],
 * so those entries must keep their position. */
static const char* kWords[] = {
    "lorem",        "ipsum",      "dolor",       "sit",          "amet",
    "consectetur",  "adipiscing", "elit",        "sed",          "do",
    "eiusmod",      "tempor",     "incididunt",  "ut",           "labore",
    "et",           "dolore",     "magna",       "aliqua",       "dis",
    "lectus",       "vestibulum", "mattis",      "ullamcorper",  "velit",
    "commodo",      "a",          "lacus",       "arcu",         "magnis",
    "parturient",   "montes",     "nascetur",    "ridiculus",    "mus",
    "mauris",       "nulla",      "malesuada",   "pellentesque", "eget",
    "gravida",      "in",         "dictum",      "non",          "erat",
    "nam",          "voluptat",   "maecenas",    "blandit",      "aliquam",
    "etiam",        "enim",       "lobortis",    "scelerisque",  "fermentum",
    "dui",          "faucibus",   "ornare",      "at",           "elementum",
    "eu",           "facilisis",  "odio",        "morbi",        "quis",
    "eros",         "donec",      "ac",          "orci",         "purus",
    "turpis",       "cursus",     "leo",         "vel",          "porta",
    "consequat",    "interdum",   "varius",      "vulputate",    "aliquet",
    "pharetra",     "nunc",       "auctor",      "urna",         "id",
    "metus",        "viverra",    "nibh",        "cras",         "mi",
    "unde",         "omnis",      "iste",        "natus",        "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque",   "laudantium",
    "totam",        "rem",        "aperiam",     "eaque",        "ipsa",
    "quae",         "ab",         "illo",        "inventore",    "veritatis",
    "quasi",        "architecto", "beatae",      "vitae",        "dicta",
    "sunt",         "explicabo",  "nemo",        "ipsam",        "quia",
    "voluptas",     "aspernatur", "aut",         "odit",         "fugit",
    "consequuntur", "magni",      "dolores",     "eos",          "qui",
    "ratione",      "sequi",      "nesciunt",    "neque",        "porro",
    "quisquam",     "est",        "dolorem",     "adipisci",     "numquam",
    "eius",         "modi",       "tempora",     "incidunt",     "magnam",
    "quaerat",      "ad",         "minima",      "veniam",       "nostrum",
    "ullam",        "corporis",   "suscipit",    "laboriosam",   "nisi",
    "aliquid",      "ex",         "ea",          "commodi",      "consequatur",
    "autem",        "eum",        "iure",        "voluptate",    "esse",
    "quam",         "nihil",      "molestiae",   "illum",        "fugiat",
    "quo",          "pariatur",   "vero",        "accusamus",    "iusto",
    "dignissimos",  "ducimus",    "blanditiis",  "praesentium",  "voluptatum",
    "deleniti",     "atque",      "corrupti",    "quos",         "quas",
    "molestias",    "excepturi",  "sint",        "occaecati",    "cupiditate",
    "provident",    "similique",  "culpa",       "officia",      "deserunt",
    "mollitia",     "animi",      "laborum",     "dolorum",      "fuga",
    "harum",        "quidem",     "rerum",       "facilis",      "expedita",
    "distinctio",   "libero",     "tempore",     "cum",          "soluta",
    "nobis",        "eligendi",   "optio",       "cumque",       "impedit",
    "minus",        "quod",       "maxime",      "placeat",      "facere",
    "possimus",     "assumenda",  "repellendus", "temporibus",   "quibusdam",
    "officiis",     "debitis",    "saepe",       "eveniet",      "voluptates",
    "repudiandae",  "recusandae", "itaque",      "earum",        "hic",
    "tenetur",      "sapiente",   "delectus",    "reiciendis",   "cillum",
    "maiores",      "alias",      "perferendis", "doloribus",    "asperiores",
    "repellat",     "minim",      "nostrud",     "exercitation", "ullamco",
    "laboris",      "aliquip",    "duis",        "aute",         "irure",
};
/* Number of entries in kWords. */
static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]);
95

96
/* Simple one-dimensional distribution based on word length:
 * kWeights[n] is the number of slots a word of n characters receives in the
 * selection table. Words at least as long as the table use its last entry,
 * and zero-length words get no slot, so short words are favored. */
static const int kWeights[]    = { 0, 8, 6, 4, 3, 2 };
/* Number of entries in kWeights. */
static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]);
100

101
/* Capacity of the selection table below. */
#define DISTRIB_SIZE_MAX 650
/* Selection table: each slot holds a word index; a word occupies as many
 * slots as its weight, so picking a uniformly random slot picks a word with
 * probability proportional to its weight. */
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
/* Number of valid slots in g_distrib (0 means "not initialized yet"). */
static unsigned g_distribCount         = 0;

/* wordWeight():
 * Returns the number of table slots granted to `word`.
 * The weight is looked up by word length; lengths beyond the end of
 * `weights` use its last entry. An empty weight table or a negative weight
 * yields 0, so the result can be summed without wrapping around. */
static unsigned wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t len = strlen(word);
    int weight;
    if (nbWeights == 0) {
        return 0;
    }
    if (len >= nbWeights) {
        len = nbWeights - 1;
    }
    weight = weights[len];
    return (weight > 0) ? (unsigned)weight : 0;
}

/* countFreqs():
 * Sums the weights of all `nbWords` words and stores the result in
 * g_distribCount. A total above DISTRIB_SIZE_MAX is a configuration error:
 * it trips the assert in debug builds, and is clamped to the table capacity
 * when asserts are compiled out, so the count never exceeds the array. */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned total = 0;
    size_t w;
    for (w = 0; w < nbWords; w++) {
        total += wordWeight(words[w], weights, nbWeights);
    }
    assert(total <= DISTRIB_SIZE_MAX);
    if (total > DISTRIB_SIZE_MAX) {
        total = DISTRIB_SIZE_MAX;
    }
    g_distribCount = total;
}

/* init_word_distrib():
 * Builds the selection table: word index `w` is written into g_distrib once
 * per unit of its weight, in word order. g_distribCount is set first (via
 * countFreqs()) and also bounds the writes, so the table can never be
 * overrun even if the assert in countFreqs() is disabled. */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t w, d = 0;
    countFreqs(words, nbWords, weights, nbWeights);
    for (w = 0; w < nbWords && d < g_distribCount; w++) {
        unsigned const weight = wordWeight(words[w], weights, nbWeights);
        unsigned l;
        for (l = 0; l < weight && d < g_distribCount; l++) {
            g_distrib[d++] = (int)w;
        }
    }
}
144

145
/* Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed:
 * all generation state lives in the file-level variables below. */
static char* g_ptr         = NULL;     /* output buffer currently being filled */
static size_t g_nbChars    = 0;        /* bytes written into g_ptr so far */
static size_t g_maxChars   = 10000000; /* upper bound on bytes to write */
static unsigned g_randRoot = 0;        /* state of the pseudo-random generator */

/* Rotate-left by `r` bits within 32 bits.
 * Arguments are fully parenthesized so that expressions can be passed safely.
 * `r` must be within 1..31, and `x` is expected to be a 32-bit unsigned. */
#define RDG_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

/* LOREM_rand():
 * Advances the generator state g_randRoot (multiply, xor, rotate) and scales
 * the new 32-bit state into the interval [0, range).
 * Returns 0 when `range` is 0 or 1.
 * The arithmetic assumes `unsigned` is 32 bits wide. */
static unsigned LOREM_rand(unsigned range)
{
    static const unsigned prime1 = 2654435761U;
    static const unsigned prime2 = 2246822519U;
    unsigned rand32              = g_randRoot;
    rand32 *= prime1;
    rand32 ^= prime2;
    rand32     = RDG_rotl32(rand32, 13);
    g_randRoot = rand32;
    /* multiply-high: maps the 32-bit value onto [0, range) without a modulo */
    return (unsigned)(((unsigned long long)rand32 * range) >> 32);
}
164

165
static void writeLastCharacters(void)
166
{
167
    size_t lastChars = g_maxChars - g_nbChars;
168
    assert(g_maxChars >= g_nbChars);
169
    if (lastChars == 0)
170
        return;
171
    g_ptr[g_nbChars++] = '.';
172
    if (lastChars > 2) {
173
        memset(g_ptr + g_nbChars, ' ', lastChars - 2);
174
    }
175
    if (lastChars > 1) {
176
        g_ptr[g_maxChars - 1] = '\n';
177
    }
178
    g_nbChars = g_maxChars;
179
}
180

181
static void generateWord(const char* word, const char* separator, int upCase)
182
{
183
    size_t const len = strlen(word) + strlen(separator);
184
    if (g_nbChars + len > g_maxChars) {
185
        writeLastCharacters();
186
        return;
187
    }
188
    memcpy(g_ptr + g_nbChars, word, strlen(word));
189
    if (upCase) {
190
        static const char toUp = 'A' - 'a';
191
        g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
192
    }
193
    g_nbChars += strlen(word);
194
    memcpy(g_ptr + g_nbChars, separator, strlen(separator));
195
    g_nbChars += strlen(separator);
196
}
197

198
/* about():
 * Returns a random count built from two draws in [0, target) plus one.
 * For target >= 1 the result lies in [1, 2*target - 1];
 * for target == 0 it is always 1.
 * Consumes two values from the random generator. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
202

203
/* Function to generate a random sentence */
204
static void generateSentence(int nbWords)
205
{
206
    int commaPos       = about(9);
207
    int comma2         = commaPos + about(7);
208
    int qmark          = (LOREM_rand(11) == 7);
209
    const char* endSep = qmark ? "? " : ". ";
210
    int i;
211
    for (i = 0; i < nbWords; i++) {
212
        int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
213
        const char* const word = kWords[wordID];
214
        const char* sep        = " ";
215
        if (i == commaPos)
216
            sep = ", ";
217
        if (i == comma2)
218
            sep = ", ";
219
        if (i == nbWords - 1)
220
            sep = endSep;
221
        generateWord(word, sep, i == 0);
222
    }
223
}
224

225
static void generateParagraph(int nbSentences)
226
{
227
    int i;
228
    for (i = 0; i < nbSentences; i++) {
229
        int wordsPerSentence = about(11);
230
        generateSentence(wordsPerSentence);
231
    }
232
    if (g_nbChars < g_maxChars) {
233
        g_ptr[g_nbChars++] = '\n';
234
    }
235
    if (g_nbChars < g_maxChars) {
236
        g_ptr[g_nbChars++] = '\n';
237
    }
238
}
239

240
/* It's "common" for lorem ipsum generators to start with the same first
241
 * pre-defined sentence */
242
static void generateFirstSentence(void)
243
{
244
    int i;
245
    for (i = 0; i < 18; i++) {
246
        const char* word      = kWords[i];
247
        const char* separator = " ";
248
        if (i == 4)
249
            separator = ", ";
250
        if (i == 7)
251
            separator = ", ";
252
        generateWord(word, separator, i == 0);
253
    }
254
    generateWord(kWords[18], ". ", 0);
255
}
256

257
size_t
258
LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
259
{
260
    g_ptr = (char*)buffer;
261
    assert(size < INT_MAX);
262
    g_maxChars = size;
263
    g_nbChars  = 0;
264
    g_randRoot = seed;
265
    if (g_distribCount == 0) {
266
        init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
267
    }
268

269
    if (first) {
270
        generateFirstSentence();
271
    }
272
    while (g_nbChars < g_maxChars) {
273
        int sentencePerParagraph = about(7);
274
        generateParagraph(sentencePerParagraph);
275
        if (!fill)
276
            break; /* only generate one paragraph in not-fill mode */
277
    }
278
    g_ptr = NULL;
279
    return g_nbChars;
280
}
281

282
/* LOREM_genBuffer():
 * Convenience wrapper around LOREM_genBlock(): generates text from `seed`
 * that starts with the fixed opening sentence (first = 1) and keeps adding
 * paragraphs until `size` bytes of `buffer` are written (fill = 1).
 * The number of bytes written is not reported. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    (void)LOREM_genBlock(buffer, size, seed, 1, 1);
}
286

287
Product

Resources

Company