Path: blob/main/sys/contrib/openzfs/module/zstd/zfs_zstd.c
// SPDX-License-Identifier: BSD-3-Clause
/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

#ifndef IN_LIBSA
static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);
#endif
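/*
 * Illustrative note (not upstream documentation): the tunables above drive
 * the early abort heuristic in zfs_zstd_compress_buf() below.
 * zstd_earlyabort_pass turns the heuristic on or off, zstd_cutoff_level is
 * the lowest requested level at which it applies, and zstd_abort_size is
 * the smallest block considered worth pre-screening. With the defaults,
 * only blocks of at least 128 KiB requested at zstd-3 or higher get the
 * LZ4/zstd-1 probe passes.
 */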
#ifdef IN_BASE
int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
#endif

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * The block was excluded from the early abort heuristic entirely
	 * (level below the cutoff, or block smaller than the abort size).
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif
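/*
 * Illustrative usage note (assumption, Linux-specific path): these counters
 * surface as a kstat at /proc/spl/kstat/zfs/zstd. Because of the KSTAT_WRITE
 * handler above, any write to that file (e.g.
 * `echo 0 > /proc/spl/kstat/zfs/zstd`) resets the failure and early-abort
 * counters. buffers/size are deliberately not zeroed, since they track live
 * pool state rather than event counts.
 */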
/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers are split up to keep the implementation as simple as possible.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

#ifndef IN_LIBSA
/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};
#endif

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER	1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be
 * allocated. If other pooled objects are detected without being used for
 * 2 minutes, they will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */
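/*
 * Illustrative sketch (assumption, not upstream documentation) of a single
 * pool slot's lifecycle under the scheme described above:
 *
 *	zstd_mempool_alloc()	-> mutex_tryenter() succeeds, slot fits:
 *				   timeout = now + 120s, buffer handed out
 *	zstd_mempool_free()	-> mutex_exit(); slot stays allocated
 *	zstd_mempool_reap()	-> if now > timeout and the slot is idle,
 *				   vmem_free() empties the slot
 *
 * The barrier mutex doubles as the "in use" marker, so a held mutex means
 * the buffer must not be reaped or handed to another thread.
 */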
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to take it before checking the slot to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the requested size, if so
			 * we take it and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}
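/*
 * Worked example (illustrative): for the standard levels the table is an
 * identity mapping, so ZIO_ZSTD_LEVEL_5 (5) indexes zstd_levels[4] and
 * yields zstd level 5. For the fast levels the index is offset past the 19
 * standard entries: ZIO_ZSTD_LEVEL_FAST_5 indexes
 * zstd_levels[ZIO_ZSTD_LEVEL_FAST_5 - ZIO_ZSTD_LEVEL_FAST_1 + 19], whose
 * zstd_level field is -5, i.e. zstd's "fast" (negative) level 5.
 */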
#ifndef IN_LIBSA
/* Compress block using zstd */
static size_t
zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage
	 * since this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be
	 * added to the compressed buffer and which, if unhandled, would
	 * confuse the hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to be ever reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way, where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}
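/*
 * Worked example (illustrative): ZSTD_VERSION_NUMBER is computed as
 * MAJOR * 10000 + MINOR * 100 + RELEASE, so zstd 1.4.5 encodes as 10405.
 * That comfortably fits the 24-bit field (max 0xFFFFFF = 16777215, i.e.
 * version 1677.72.15 as noted above). A block compressed with zstd 1.4.5
 * at ZIO_ZSTD_LEVEL_7 would thus carry version = 10405 and level = 7 in
 * raw_version_level, stored big-endian; the exact bit placement is left
 * to the zfs_set_hdrversion()/zfs_set_hdrlevel() helpers.
 */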
static size_t
zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is below the cutoff level (zstd-3 by default),
	 *   or smaller than zstd_abort_size (currently 128k), don't try any
	 *   of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf
	 *   win with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		abd_t sabd, dabd;
		abd_get_from_buf_struct(&sabd, s_start, s_len);
		abd_get_from_buf_struct(&dabd, d_start, d_len);
		int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
		abd_free(&dabd);
		abd_free(&sabd);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
		    d_len, ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
}
#endif
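/*
 * Illustrative walk-through (assumption, using the default tunables): a
 * 1 MiB record written at zstd-9 first gets an LZ4 pass. If LZ4 shrinks it
 * below d_len, the data is clearly compressible and we proceed straight to
 * zstd-9. If LZ4 fails, a zstd-1 probe gets a second opinion; only if that
 * also cannot fit the output buffer do we return s_len, which the caller
 * treats as "store uncompressed". A 64 KiB record at zstd-9, or any record
 * at zstd-2 or below, skips the heuristic entirely and is counted in
 * passignored.
 */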
/* Decompress block using zstd and return its stored level */
static int
zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}
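/*
 * Illustrative note on the sanity checks above (assumption, not upstream
 * documentation): with an s_len of 128 KiB and an 8-byte header, c_len may
 * be at most 131064, so a corrupted big-endian length field is caught
 * before it can make ZSTD_decompressDCtx() read past the source buffer.
 * Together with the level check this rejects most corrupted headers
 * cheaply, and the error return lets the upper layers attempt recovery.
 */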
/* Decompress datablock using zstd */
#ifdef IN_BASE
int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#else
static int
zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int level __maybe_unused)
{
	return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
	    NULL));
}
#endif

#ifndef IN_LIBSA
ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

#endif
/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until
		 * decompression is completed. zstd_free will release this
		 * barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}
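/*
 * Illustrative arithmetic (assumption, 4 KB PAGESIZE): with
 * ZSTD_estimateDCtxSize() returning roughly 160 KB on x64, adding the
 * zstd_kmem bookkeeping header and rounding up with P2ROUNDUP() yields a
 * reservation of about 164 KB. Since this region is guarded by a single
 * mutex, only one fallback decompression can run at a time; any further
 * fallback users wait in zstd_dctx_alloc().
 */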
/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* release unused memory from pool */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Short-circuit if there are no buffers to begin with.
	 */
	if (ZSTDSTAT(zstd_stat_buffers) == 0)
		return;

	/*
	 * Reaping scans the pools and releases objects that have been
	 * unused for longer than their timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
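/*
 * Illustrative usage note (assumption, paths are Linux-specific): the two
 * module parameters above surface as
 * /sys/module/zfs/parameters/zstd_earlyabort_pass and
 * /sys/module/zfs/parameters/zstd_abort_size, and the heuristic's hit/miss
 * counters can be watched in /proc/spl/kstat/zfs/zstd while tuning them.
 */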