GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zstd/zfs_zstd.c
// SPDX-License-Identifier: BSD-3-Clause
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif
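/*
 * Note: these three tunables control the two-pass early-abort heuristic
 * implemented in zfs_zstd_compress_buf() below; zstd_earlyabort_pass and
 * zstd_abort_size are also exposed as module parameters at the bottom of
 * this file.
 */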

#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
        kstat_named_t zstd_stat_alloc_fail;
        kstat_named_t zstd_stat_alloc_fallback;
        kstat_named_t zstd_stat_com_alloc_fail;
        kstat_named_t zstd_stat_dec_alloc_fail;
        kstat_named_t zstd_stat_com_inval;
        kstat_named_t zstd_stat_dec_inval;
        kstat_named_t zstd_stat_dec_header_inval;
        kstat_named_t zstd_stat_com_fail;
        kstat_named_t zstd_stat_dec_fail;
        /*
         * LZ4 first-pass early abort verdict
         */
        kstat_named_t zstd_stat_lz4pass_allowed;
        kstat_named_t zstd_stat_lz4pass_rejected;
        /*
         * zstd-1 second-pass early abort verdict
         */
        kstat_named_t zstd_stat_zstdpass_allowed;
        kstat_named_t zstd_stat_zstdpass_rejected;
        /*
         * Blocks that skipped early abort entirely (tunable disabled, level
         * below the cutoff, or block smaller than zstd_abort_size)
         */
        kstat_named_t zstd_stat_passignored;
        kstat_named_t zstd_stat_passignored_size;
        kstat_named_t zstd_stat_buffers;
        kstat_named_t zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
        { "alloc_fail", KSTAT_DATA_UINT64 },
        { "alloc_fallback", KSTAT_DATA_UINT64 },
        { "compress_alloc_fail", KSTAT_DATA_UINT64 },
        { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
        { "compress_level_invalid", KSTAT_DATA_UINT64 },
        { "decompress_level_invalid", KSTAT_DATA_UINT64 },
        { "decompress_header_invalid", KSTAT_DATA_UINT64 },
        { "compress_failed", KSTAT_DATA_UINT64 },
        { "decompress_failed", KSTAT_DATA_UINT64 },
        { "lz4pass_allowed", KSTAT_DATA_UINT64 },
        { "lz4pass_rejected", KSTAT_DATA_UINT64 },
        { "zstdpass_allowed", KSTAT_DATA_UINT64 },
        { "zstdpass_rejected", KSTAT_DATA_UINT64 },
        { "passignored", KSTAT_DATA_UINT64 },
        { "passignored_size", KSTAT_DATA_UINT64 },
        { "buffers", KSTAT_DATA_UINT64 },
        { "size", KSTAT_DATA_UINT64 },
};
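/*
 * These counters are exported through the "zstd" kstat registered in
 * zstd_init() below; they typically surface as /proc/spl/kstat/zfs/zstd on
 * Linux and under the kstat.zfs.misc.zstd sysctl tree on FreeBSD.
 */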

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
        ASSERT(ksp != NULL);

        if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
                ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
                ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
                ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
                ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
                ZSTDSTAT_ZERO(zstd_stat_com_inval);
                ZSTDSTAT_ZERO(zstd_stat_dec_inval);
                ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
                ZSTDSTAT_ZERO(zstd_stat_com_fail);
                ZSTDSTAT_ZERO(zstd_stat_dec_fail);
                ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
                ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
                ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
                ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
                ZSTDSTAT_ZERO(zstd_stat_passignored);
                ZSTDSTAT_ZERO(zstd_stat_passignored_size);
        }

        return (0);
}
#endif

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
        ZSTD_KMEM_UNKNOWN = 0,
        /* Allocation type using kmem_vmalloc */
        ZSTD_KMEM_DEFAULT,
        /* Pool based allocation using mempool_alloc */
        ZSTD_KMEM_POOL,
        /* Reserved fallback memory for decompression only */
        ZSTD_KMEM_DCTX,
        ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
        void *mem;
        size_t size;
        kmutex_t barrier;
        hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
        enum zstd_kmem_type kmem_type;
        size_t kmem_size;
        struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
        size_t mem_size;
        void *mem;
        kmutex_t barrier;
};

struct zstd_levelmap {
        int16_t zstd_level;
        enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers are split up to keep the implementation as simple as possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
        zstd_alloc,
        zstd_free,
        NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
        zstd_dctx_alloc,
        zstd_free,
        NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
        {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
        {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
        {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
        {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
        {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
        {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
        {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
        {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
        {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
        {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
        {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
        {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
        {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
        {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
        {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
        {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
        {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
        {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
        {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
        {-1, ZIO_ZSTD_LEVEL_FAST_1},
        {-2, ZIO_ZSTD_LEVEL_FAST_2},
        {-3, ZIO_ZSTD_LEVEL_FAST_3},
        {-4, ZIO_ZSTD_LEVEL_FAST_4},
        {-5, ZIO_ZSTD_LEVEL_FAST_5},
        {-6, ZIO_ZSTD_LEVEL_FAST_6},
        {-7, ZIO_ZSTD_LEVEL_FAST_7},
        {-8, ZIO_ZSTD_LEVEL_FAST_8},
        {-9, ZIO_ZSTD_LEVEL_FAST_9},
        {-10, ZIO_ZSTD_LEVEL_FAST_10},
        {-20, ZIO_ZSTD_LEVEL_FAST_20},
        {-30, ZIO_ZSTD_LEVEL_FAST_30},
        {-40, ZIO_ZSTD_LEVEL_FAST_40},
        {-50, ZIO_ZSTD_LEVEL_FAST_50},
        {-60, ZIO_ZSTD_LEVEL_FAST_60},
        {-70, ZIO_ZSTD_LEVEL_FAST_70},
        {-80, ZIO_ZSTD_LEVEL_FAST_80},
        {-90, ZIO_ZSTD_LEVEL_FAST_90},
        {-100, ZIO_ZSTD_LEVEL_FAST_100},
        {-500, ZIO_ZSTD_LEVEL_FAST_500},
        {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init().
 */
static int pool_count = 16;

#define ZSTD_POOL_MAX           pool_count
#define ZSTD_POOL_TIMEOUT       (60 * 2)        /* seconds */

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;
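/*
 * Pool objects are stamped with gethrestime_sec() + ZSTD_POOL_TIMEOUT on
 * every (re)use, and zstd_mempool_reap() frees any object whose stamp has
 * expired, so a buffer survives at most two minutes of disuse.
 */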

/*
 * The zstd library code expects these functions when ADDRESS_SANITIZER is
 * defined. Userland ASAN provides them, but KASAN does not, so to avoid
 * changing the external code we supply no-op stubs here.
 */
#if defined(ZFS_ASAN_ENABLED)
#define ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif

static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
        struct zstd_pool *pool;

        if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
                return;
        }

        /* free obsolete slots */
        for (int i = 0; i < ZSTD_POOL_MAX; i++) {
                pool = &zstd_mempool[i];
                if (pool->mem && mutex_tryenter(&pool->barrier)) {
                        /* Free memory if the object is unused past timeout */
                        if (pool->mem && gethrestime_sec() > pool->timeout) {
                                vmem_free(pool->mem, pool->size);
                                ZSTDSTAT_SUB(zstd_stat_buffers, 1);
                                ZSTDSTAT_SUB(zstd_stat_size, pool->size);
                                pool->mem = NULL;
                                pool->size = 0;
                                pool->timeout = 0;
                        }
                        mutex_exit(&pool->barrier);
                }
        }
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
        struct zstd_pool *pool;
        struct zstd_kmem *mem = NULL;

        if (!zstd_mempool) {
                return (NULL);
        }

        /* Scan for a fitting preallocated memory slot */
        for (int i = 0; i < ZSTD_POOL_MAX; i++) {
                pool = &zstd_mempool[i];
                /*
                 * This lock is simply a marker for a pool object being in use.
                 * If it's already held, it will be skipped.
                 *
                 * We need to take it before checking the slot to avoid race
                 * conditions caused by running in a threaded context.
                 *
                 * The lock is later released by zstd_mempool_free.
                 */
                if (mutex_tryenter(&pool->barrier)) {
                        /*
                         * Check if the object fits the requested size; if so,
                         * take it and update the timestamp.
                         */
                        if (pool->mem && size <= pool->size) {
                                pool->timeout = gethrestime_sec() +
                                    ZSTD_POOL_TIMEOUT;
                                mem = pool->mem;
                                return (mem);
                        }
                        mutex_exit(&pool->barrier);
                }
        }

        /*
         * If no preallocated slot was found, try to fill in a new one.
         *
         * We run a similar algorithm twice here to avoid pool fragmentation.
         * The first one may generate holes in the list if objects get released.
         * We always make sure that these holes get filled instead of adding new
         * allocations constantly at the end.
         */
        for (int i = 0; i < ZSTD_POOL_MAX; i++) {
                pool = &zstd_mempool[i];
                if (mutex_tryenter(&pool->barrier)) {
                        /* Object is free, try to allocate new one */
                        if (!pool->mem) {
                                mem = vmem_alloc(size, KM_SLEEP);
                                if (mem) {
                                        ZSTDSTAT_ADD(zstd_stat_buffers, 1);
                                        ZSTDSTAT_ADD(zstd_stat_size, size);
                                        pool->mem = mem;
                                        pool->size = size;
                                        /* Keep track for later release */
                                        mem->pool = pool;
                                        mem->kmem_type = ZSTD_KMEM_POOL;
                                        mem->kmem_size = size;
                                }
                        }

                        if (size <= pool->size) {
                                /* Update timestamp */
                                pool->timeout = gethrestime_sec() +
                                    ZSTD_POOL_TIMEOUT;

                                return (pool->mem);
                        }

                        mutex_exit(&pool->barrier);
                }
        }

        /*
         * If the pool is full or the allocation failed, try lazy allocation
         * instead.
         */
        if (!mem) {
                mem = vmem_alloc(size, KM_NOSLEEP);
                if (mem) {
                        mem->pool = NULL;
                        mem->kmem_type = ZSTD_KMEM_DEFAULT;
                        mem->kmem_size = size;
                }
        }

        return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
        mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
        if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
                *zstd_level = zstd_levels[level - 1].zstd_level;
                return (0);
        }
        if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
            level <= ZIO_ZSTD_LEVEL_FAST_1000) {
                *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
                    + ZIO_ZSTD_LEVEL_19].zstd_level;
                return (0);
        }

        /* Invalid/unknown zfs compression enum - this should never happen. */
        return (1);
}
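/*
 * Worked example of the mapping above: the first branch indexes zstd_levels[]
 * directly, so ZIO_ZSTD_LEVEL_7 yields zstd level 7; the second branch offsets
 * into the negative "fast" region, so ZIO_ZSTD_LEVEL_FAST_1 resolves to
 * zstd_levels[ZIO_ZSTD_LEVEL_19] (the {-1, ZIO_ZSTD_LEVEL_FAST_1} entry),
 * i.e. zstd level -1.
 */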

#ifndef IN_LIBSA
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
        size_t c_len;
        int16_t zstd_level;
        zfs_zstdhdr_t *hdr;
        ZSTD_CCtx *cctx;

        hdr = (zfs_zstdhdr_t *)d_start;

        /* Skip compression if the specified level is invalid */
        if (zstd_enum_to_level(level, &zstd_level)) {
                ZSTDSTAT_BUMP(zstd_stat_com_inval);
                return (s_len);
        }

        ASSERT3U(d_len, >=, sizeof (*hdr));
        ASSERT3U(d_len, <=, s_len);
        ASSERT3U(zstd_level, !=, 0);

        cctx = ZSTD_createCCtx_advanced(zstd_malloc);

        /*
         * Out of kernel memory, gently fall through - this will disable
         * compression in zio_compress_data
         */
        if (!cctx) {
                ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
                return (s_len);
        }

        /* Set the compression level */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

        /* Use the "magicless" zstd header which saves us 4 header bytes */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

        /*
         * Disable redundant checksum calculation and content size storage
         * since this is already done by ZFS itself.
         */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

        c_len = ZSTD_compress2(cctx,
            hdr->data,
            d_len - sizeof (*hdr),
            s_start, s_len);

        ZSTD_freeCCtx(cctx);

        /* Error in the compression routine, disable compression. */
        if (ZSTD_isError(c_len)) {
                /*
                 * If we are aborting the compression because the savings are
                 * too small, that is not a failure. Everything else is a
                 * failure, so increment the compression failure counter.
                 */
                int err = ZSTD_getErrorCode(c_len);
                if (err != ZSTD_error_dstSize_tooSmall) {
                        ZSTDSTAT_BUMP(zstd_stat_com_fail);
                        dprintf("Error: %s", ZSTD_getErrorString(err));
                }
                return (s_len);
        }

        /*
         * Encode the compressed buffer size at the start. We'll need this in
         * decompression to counter the effects of padding which might be added
         * to the compressed buffer and which, if unhandled, would confuse the
         * hell out of our decompression function.
         */
        hdr->c_len = BE_32(c_len);

        /*
         * Check version for overflow.
         * The limit of 24 bits must not be exceeded. This allows a maximum
         * version 1677.72.15 which we don't expect to be ever reached.
         */
        ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

        /*
         * Encode the compression level as well. We may need to know the
         * original compression level if compressed_arc is disabled, to match
         * the compression settings to write this block to the L2ARC.
         *
         * Encode the actual level, so if the enum changes in the future, we
         * will be compatible.
         *
         * The upper 24 bits store the ZSTD version to be able to provide
         * future compatibility, since new versions might enhance the
         * compression algorithm in a way, where the compressed data will
         * change.
         *
         * As soon as such incompatibility occurs, handling code needs to be
         * added, differentiating between the versions.
         */
        zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
        zfs_set_hdrlevel(hdr, level);
        hdr->raw_version_level = BE_32(hdr->raw_version_level);

        return (c_len + sizeof (*hdr));
}
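/*
 * Per the code above, the resulting on-disk layout in front of the raw
 * (magicless) zstd frame is: a 32-bit big-endian c_len, then a 32-bit
 * big-endian word packing the 24-bit ZSTD_VERSION_NUMBER and the 8-bit
 * level (see the zfs_zstdhdr_t accessors in sys/zstd/zstd.h).
 */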

static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
        int16_t zstd_level;
        if (zstd_enum_to_level(level, &zstd_level)) {
                ZSTDSTAT_BUMP(zstd_stat_com_inval);
                return (s_len);
        }
        /*
         * A zstd early abort heuristic.
         *
         * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
         *   128k), don't try any of this, just go.
         *   (because experimentally that was a reasonable cutoff for a perf
         *   win with tiny ratio change)
         * - First, we try LZ4 compression, and if it doesn't early abort, we
         *   jump directly to whatever compression level we intended to try.
         * - Second, we try zstd-1 - if that errors out (usually, but not
         *   exclusively, if it would overflow), we give up early.
         *
         *   If it works, instead we go on and compress anyway.
         *
         * Why two passes? LZ4 alone gets you a lot of the way, but on highly
         * compressible data, it was losing up to 8.5% of the compressed
         * savings versus no early abort, and all the zstd-fast levels are
         * worse indications on their own than LZ4, and don't improve the LZ4
         * pass noticeably if stacked like this.
         */
        size_t actual_abort_size = zstd_abort_size;
        if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
            s_len >= actual_abort_size) {
                abd_t sabd, dabd;
                abd_get_from_buf_struct(&sabd, s_start, s_len);
                abd_get_from_buf_struct(&dabd, d_start, d_len);
                int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
                abd_free(&dabd);
                abd_free(&sabd);
                if (pass_len < d_len) {
                        ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
                        goto keep_trying;
                }
                ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

                pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
                    d_len, ZIO_ZSTD_LEVEL_1);
                if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
                        ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
                        return (s_len);
                }
                ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
        } else {
                ZSTDSTAT_BUMP(zstd_stat_passignored);
                if (s_len < actual_abort_size) {
                        ZSTDSTAT_BUMP(zstd_stat_passignored_size);
                }
        }
keep_trying:
        return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
#endif

/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
        ZSTD_DCtx *dctx;
        size_t result;
        int16_t zstd_level;
        uint32_t c_len;
        const zfs_zstdhdr_t *hdr;
        zfs_zstdhdr_t hdr_copy;

        hdr = (const zfs_zstdhdr_t *)s_start;
        c_len = BE_32(hdr->c_len);

        /*
         * Make a copy instead of directly converting the header, since we must
         * not modify the original data that may be used again later.
         */
        hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
        uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

        /*
         * NOTE: We ignore the ZSTD version for now. As soon as any
         * incompatibility occurs, it has to be handled accordingly.
         * The version can be accessed via `hdr_copy.version`.
         */

        /*
         * Convert and check the level.
         * An invalid level is a strong indicator of data corruption! In such
         * a case, return an error so the upper layers can try to fix it.
         */
        if (zstd_enum_to_level(curlevel, &zstd_level)) {
                ZSTDSTAT_BUMP(zstd_stat_dec_inval);
                return (1);
        }

        ASSERT3U(d_len, >=, s_len);
        ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

        /* Invalid compressed buffer size encoded at start */
        if (c_len + sizeof (*hdr) > s_len) {
                ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
                return (1);
        }

        dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
        if (!dctx) {
                ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
                return (1);
        }

        /* Set header type to "magicless" */
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

        /* Decompress the data and release the context */
        result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
        ZSTD_freeDCtx(dctx);

        /*
         * Returns 0 on success (decompression function returned non-negative)
         * and non-zero on failure (decompression function returned negative).
         */
        if (ZSTD_isError(result)) {
                ZSTDSTAT_BUMP(zstd_stat_dec_fail);
                return (1);
        }

        if (level) {
                *level = curlevel;
        }

        return (0);
}

/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
        return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
            NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
        return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
            NULL));
}
#endif

#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
        size_t nbytes = sizeof (struct zstd_kmem) + size;
        struct zstd_kmem *z = NULL;

        z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

        if (!z) {
                ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
                return (NULL);
        }

        return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
        size_t nbytes = sizeof (struct zstd_kmem) + size;
        struct zstd_kmem *z = NULL;
        enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

        z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
        if (!z) {
                /* Try harder, decompression shall not fail */
                z = vmem_alloc(nbytes, KM_SLEEP);
                if (z) {
                        z->pool = NULL;
                }
                ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
        } else {
                return ((void*)z + (sizeof (struct zstd_kmem)));
        }

        /* Fallback if everything fails */
        if (!z) {
                /*
                 * Barrier since we can only handle it in a single thread. All
                 * other following threads need to wait here until
                 * decompression is completed. zstd_free will release this
                 * barrier later.
                 */
                mutex_enter(&zstd_dctx_fallback.barrier);

                z = zstd_dctx_fallback.mem;
                type = ZSTD_KMEM_DCTX;
                ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
        }

        /* Allocation should always be successful */
        if (!z) {
                return (NULL);
        }

        z->kmem_type = type;
        z->kmem_size = nbytes;

        return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
        struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
        enum zstd_kmem_type type;

        ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
        ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

        type = z->kmem_type;
        switch (type) {
        case ZSTD_KMEM_DEFAULT:
                vmem_free(z, z->kmem_size);
                break;
        case ZSTD_KMEM_POOL:
                zstd_mempool_free(z);
                break;
        case ZSTD_KMEM_DCTX:
                mutex_exit(&zstd_dctx_fallback.barrier);
                break;
        default:
                break;
        }
}
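/*
 * Note on the scheme above: every buffer handed to the zstd library is
 * prefixed with a struct zstd_kmem header recording how it was obtained,
 * which is why zstd_alloc()/zstd_dctx_alloc() return the address just past
 * the header and zstd_free() steps back by sizeof (struct zstd_kmem) to
 * recover it.
 */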

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
        mem->mem_size = size;
        mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
        mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
        zstd_mempool_cctx =
            vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
        zstd_mempool_dctx =
            vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

        for (int i = 0; i < ZSTD_POOL_MAX; i++) {
                mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
                    MUTEX_DEFAULT, NULL);
                mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
                    MUTEX_DEFAULT, NULL);
        }
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
        zstd_mempool_init();

        /*
         * Estimate the size of the fallback decompression context.
         * The expected size on x64 with current ZSTD should be about 160 KB.
         */
        create_fallback_mem(&zstd_dctx_fallback,
            P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
            PAGESIZE));

        return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
        mutex_destroy(&pool->barrier);
        vmem_free(pool->mem, pool->size);
        pool->mem = NULL;
        pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
        for (int i = 0; i < ZSTD_POOL_MAX; i++) {
                release_pool(&zstd_mempool_cctx[i]);
                release_pool(&zstd_mempool_dctx[i]);
        }

        vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
        vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
        zstd_mempool_dctx = NULL;
        zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
        /*
         * Short-circuit if there are no buffers to begin with.
         */
        if (ZSTDSTAT(zstd_stat_buffers) == 0)
                return;

        /*
         * Scan both pools and release objects that have sat unused past
         * their timeout.
         */
        zstd_mempool_reap(zstd_mempool_cctx);
        zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
        /* Set pool size by using maximum sane thread count * 4 */
        pool_count = (boot_ncpus * 4);
        zstd_meminit();

        /* Initialize kstat */
        zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
            KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL);
        if (zstd_ksp != NULL) {
                zstd_ksp->ks_data = &zstd_stats;
                kstat_install(zstd_ksp);
#ifdef _KERNEL
                zstd_ksp->ks_update = kstat_zstd_update;
#endif
        }

        return (0);
}

extern void
zstd_fini(void)
{
        /* Deinitialize kstat */
        if (zstd_ksp != NULL) {
                kstat_delete(zstd_ksp);
                zstd_ksp = NULL;
        }

        /* Release fallback memory */
        vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
        mutex_destroy(&zstd_dctx_fallback.barrier);

        /* Deinit memory pool */
        zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
        "Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
        "Minimal size of block to attempt early abort");
#endif
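/*
 * These parameters surface as zstd_earlyabort_pass and zstd_abort_size in
 * the usual per-platform locations (module parameters on Linux, vfs.zfs
 * sysctls on FreeBSD); the exact paths depend on the ZFS_MODULE_PARAM
 * plumbing for the platform.
 */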