Path: blob/main/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
48383 views
// SPDX-License-Identifier: CDDL-1.01/*2* CDDL HEADER START3*4* The contents of this file are subject to the terms of the5* Common Development and Distribution License (the "License").6* You may not use this file except in compliance with the License.7*8* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE9* or https://opensource.org/licenses/CDDL-1.0.10* See the License for the specific language governing permissions11* and limitations under the License.12*13* When distributing Covered Code, include this CDDL HEADER in each14* file and include the License file at usr/src/OPENSOLARIS.LICENSE.15* If applicable, add the following below this CDDL HEADER, with the16* fields enclosed by brackets "[]" replaced with your own identifying17* information: Portions Copyright [yyyy] [name of copyright owner]18*19* CDDL HEADER END20*/21/*22* Copyright 2009 Sun Microsystems, Inc. All rights reserved.23* Use is subject to license terms.24* Copyright (C) 2016 Gvozden Nešković. All rights reserved.25*/26/*27* Copyright 2013 Saso Kiselkov. All rights reserved.28*/2930/*31* Copyright (c) 2016 by Delphix. All rights reserved.32*/3334/*35* Fletcher Checksums36* ------------------37*38* ZFS's 2nd and 4th order Fletcher checksums are defined by the following39* recurrence relations:40*41* a = a + f42* i i-1 i-143*44* b = b + a45* i i-1 i46*47* c = c + b (fletcher-4 only)48* i i-1 i49*50* d = d + c (fletcher-4 only)51* i i-1 i52*53* Where54* a_0 = b_0 = c_0 = d_0 = 055* and56* f_0 .. f_(n-1) are the input data.57*58* Using standard techniques, these translate into the following series:59*60* __n_ __n_61* \ | \ |62* a = > f b = > i * f63* n /___| n - i n /___| n - i64* i = 1 i = 165*66*67* __n_ __n_68* \ | i*(i+1) \ | i*(i+1)*(i+2)69* c = > ------- f d = > ------------- f70* n /___| 2 n - i n /___| 6 n - i71* i = 1 i = 172*73* For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.74* Since the additions are done mod (2^64), errors in the high bits may not75* be noticed. For this reason, fletcher-2 is deprecated.76*77* For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.78* A conservative estimate of how big the buffer can get before we overflow79* can be estimated using f_i = 0xffffffff for all i:80*81* % bc82* f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*483* 226484* quit85* %86*87* So blocks of up to 2k will not overflow. Our largest block size is88* 128k, which has 32k 4-byte words, so we can compute the largest possible89* accumulators, then divide by 2^64 to figure the max amount of overflow:90*91* % bc92* a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }93* a/2^64;b/2^64;c/2^64;d/2^6494* 095* 096* 136597* 1118685898* quit99* %100*101* So a and b cannot overflow. To make sure each bit of input has some102* effect on the contents of c and d, we can look at what the factors of103* the coefficients in the equations for c_n and d_n are. The number of 2s104* in the factors determines the lowest set bit in the multiplier. Running105* through the cases for n*(n+1)/2 reveals that the highest power of 2 is106* 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow107* the 64-bit accumulators, every bit of every f_i effects every accumulator,108* even for 128k blocks.109*110* If we wanted to make a stronger version of fletcher4 (fletcher4c?),111* we could do our calculations mod (2^32 - 1) by adding in the carries112* periodically, and store the number of carries in the top 32-bits.113*114* --------------------115* Checksum Performance116* --------------------117*118* There are two interesting components to checksum performance: cached and119* uncached performance. With cached data, fletcher-2 is about four times120* faster than fletcher-4. With uncached data, the performance difference is121* negligible, since the cost of a cache fill dominates the processing time.122* Even though fletcher-4 is slower than fletcher-2, it is still a pretty123* efficient pass over the data.124*125* In normal operation, the data which is being checksummed is in a buffer126* which has been filled either by:127*128* 1. a compression step, which will be mostly cached, or129* 2. a memcpy() or copyin(), which will be uncached130* (because the copy is cache-bypassing).131*132* For both cached and uncached data, both fletcher checksums are much faster133* than sha-256, and slower than 'off', which doesn't touch the data at all.134*/135136#include <sys/types.h>137#include <sys/sysmacros.h>138#include <sys/byteorder.h>139#include <sys/simd.h>140#include <sys/spa.h>141#include <sys/zio_checksum.h>142#include <sys/zfs_context.h>143#include <zfs_fletcher.h>144145#define FLETCHER_MIN_SIMD_SIZE 64146147static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);148static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);149static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,150const void *buf, uint64_t size);151static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,152const void *buf, uint64_t size);153static boolean_t fletcher_4_scalar_valid(void);154155static const fletcher_4_ops_t fletcher_4_scalar_ops = {156.init_native = fletcher_4_scalar_init,157.fini_native = fletcher_4_scalar_fini,158.compute_native = fletcher_4_scalar_native,159.init_byteswap = fletcher_4_scalar_init,160.fini_byteswap = fletcher_4_scalar_fini,161.compute_byteswap = fletcher_4_scalar_byteswap,162.valid = fletcher_4_scalar_valid,163.uses_fpu = B_FALSE,164.name = "scalar"165};166167static fletcher_4_ops_t fletcher_4_fastest_impl = {168.name = "fastest",169.valid = fletcher_4_scalar_valid170};171172static const fletcher_4_ops_t *fletcher_4_impls[] = {173&fletcher_4_scalar_ops,174&fletcher_4_superscalar_ops,175&fletcher_4_superscalar4_ops,176#if defined(HAVE_SSE2)177&fletcher_4_sse2_ops,178#endif179#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)180&fletcher_4_ssse3_ops,181#endif182#if defined(HAVE_AVX) && defined(HAVE_AVX2)183&fletcher_4_avx2_ops,184#endif185#if defined(__x86_64) && defined(HAVE_AVX512F)186&fletcher_4_avx512f_ops,187#endif188#if defined(__x86_64) && defined(HAVE_AVX512BW)189&fletcher_4_avx512bw_ops,190#endif191#if defined(__aarch64__) && !defined(__FreeBSD__)192&fletcher_4_aarch64_neon_ops,193#endif194};195196/* Hold all supported implementations */197static uint32_t fletcher_4_supp_impls_cnt = 0;198static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];199200/* Select fletcher4 implementation */201#define IMPL_FASTEST (UINT32_MAX)202#define IMPL_CYCLE (UINT32_MAX - 1)203#define IMPL_SCALAR (0)204205static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;206207#define IMPL_READ(i) (*(volatile uint32_t *) &(i))208209static struct fletcher_4_impl_selector {210const char *fis_name;211uint32_t fis_sel;212} fletcher_4_impl_selectors[] = {213{ "cycle", IMPL_CYCLE },214{ "fastest", IMPL_FASTEST },215{ "scalar", IMPL_SCALAR }216};217218#if defined(_KERNEL)219static kstat_t *fletcher_4_kstat;220221static struct fletcher_4_kstat {222uint64_t native;223uint64_t byteswap;224} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];225#endif226227/* Indicate that benchmark has been completed */228static boolean_t fletcher_4_initialized = B_FALSE;229230void231fletcher_init(zio_cksum_t *zcp)232{233ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);234}235236int237fletcher_2_incremental_native(void *buf, size_t size, void *data)238{239zio_cksum_t *zcp = data;240241const uint64_t *ip = buf;242const uint64_t *ipend = ip + (size / sizeof (uint64_t));243uint64_t a0, b0, a1, b1;244245a0 = zcp->zc_word[0];246a1 = zcp->zc_word[1];247b0 = zcp->zc_word[2];248b1 = zcp->zc_word[3];249250for (; ip < ipend; ip += 2) {251a0 += ip[0];252a1 += ip[1];253b0 += a0;254b1 += a1;255}256257ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);258return (0);259}260261void262fletcher_2_native(const void *buf, uint64_t size,263const void *ctx_template, zio_cksum_t *zcp)264{265(void) ctx_template;266fletcher_init(zcp);267(void) fletcher_2_incremental_native((void *) buf, size, zcp);268}269270int271fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)272{273zio_cksum_t *zcp = data;274275const uint64_t *ip = buf;276const uint64_t *ipend = ip + (size / sizeof (uint64_t));277uint64_t a0, b0, a1, b1;278279a0 = zcp->zc_word[0];280a1 = zcp->zc_word[1];281b0 = zcp->zc_word[2];282b1 = zcp->zc_word[3];283284for (; ip < ipend; ip += 2) {285a0 += BSWAP_64(ip[0]);286a1 += BSWAP_64(ip[1]);287b0 += a0;288b1 += a1;289}290291ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);292return (0);293}294295void296fletcher_2_byteswap(const void *buf, uint64_t size,297const void *ctx_template, zio_cksum_t *zcp)298{299(void) ctx_template;300fletcher_init(zcp);301(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);302}303304static void305fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)306{307ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);308}309310static void311fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)312{313memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));314}315316static void317fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,318uint64_t size)319{320const uint32_t *ip = buf;321const uint32_t *ipend = ip + (size / sizeof (uint32_t));322uint64_t a, b, c, d;323324a = ctx->scalar.zc_word[0];325b = ctx->scalar.zc_word[1];326c = ctx->scalar.zc_word[2];327d = ctx->scalar.zc_word[3];328329for (; ip < ipend; ip++) {330a += ip[0];331b += a;332c += b;333d += c;334}335336ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);337}338339static void340fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,341uint64_t size)342{343const uint32_t *ip = buf;344const uint32_t *ipend = ip + (size / sizeof (uint32_t));345uint64_t a, b, c, d;346347a = ctx->scalar.zc_word[0];348b = ctx->scalar.zc_word[1];349c = ctx->scalar.zc_word[2];350d = ctx->scalar.zc_word[3];351352for (; ip < ipend; ip++) {353a += BSWAP_32(ip[0]);354b += a;355c += b;356d += c;357}358359ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);360}361362static boolean_t363fletcher_4_scalar_valid(void)364{365return (B_TRUE);366}367368int369fletcher_4_impl_set(const char *val)370{371int err = -EINVAL;372uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);373size_t i, val_len;374375val_len = strlen(val);376while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */377val_len--;378379/* check mandatory implementations */380for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {381const char *name = fletcher_4_impl_selectors[i].fis_name;382383if (val_len == strlen(name) &&384strncmp(val, name, val_len) == 0) {385impl = fletcher_4_impl_selectors[i].fis_sel;386err = 0;387break;388}389}390391if (err != 0 && fletcher_4_initialized) {392/* check all supported implementations */393for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {394const char *name = fletcher_4_supp_impls[i]->name;395396if (val_len == strlen(name) &&397strncmp(val, name, val_len) == 0) {398impl = i;399err = 0;400break;401}402}403}404405if (err == 0) {406atomic_swap_32(&fletcher_4_impl_chosen, impl);407membar_producer();408}409410return (err);411}412413/*414* Returns the Fletcher 4 operations for checksums. When a SIMD415* implementation is not allowed in the current context, then fallback416* to the fastest generic implementation.417*/418static inline const fletcher_4_ops_t *419fletcher_4_impl_get(void)420{421if (!kfpu_allowed())422return (&fletcher_4_superscalar4_ops);423424const fletcher_4_ops_t *ops = NULL;425uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);426427switch (impl) {428case IMPL_FASTEST:429ASSERT(fletcher_4_initialized);430ops = &fletcher_4_fastest_impl;431break;432case IMPL_CYCLE:433/* Cycle through supported implementations */434ASSERT(fletcher_4_initialized);435ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);436static uint32_t cycle_count = 0;437uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;438ops = fletcher_4_supp_impls[idx];439break;440default:441ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);442ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);443ops = fletcher_4_supp_impls[impl];444break;445}446447ASSERT3P(ops, !=, NULL);448449return (ops);450}451452static inline void453fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)454{455fletcher_4_ctx_t ctx;456const fletcher_4_ops_t *ops = fletcher_4_impl_get();457458if (ops->uses_fpu == B_TRUE) {459kfpu_begin();460}461ops->init_native(&ctx);462ops->compute_native(&ctx, buf, size);463ops->fini_native(&ctx, zcp);464if (ops->uses_fpu == B_TRUE) {465kfpu_end();466}467}468469void470fletcher_4_native(const void *buf, uint64_t size,471const void *ctx_template, zio_cksum_t *zcp)472{473(void) ctx_template;474const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,475uint64_t);476477ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));478479if (size == 0 || p2size == 0) {480ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);481482if (size > 0)483fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,484buf, size);485} else {486fletcher_4_native_impl(buf, p2size, zcp);487488if (p2size < size)489fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,490(char *)buf + p2size, size - p2size);491}492}493494void495fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)496{497ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);498fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);499}500501static inline void502fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)503{504fletcher_4_ctx_t ctx;505const fletcher_4_ops_t *ops = fletcher_4_impl_get();506507if (ops->uses_fpu == B_TRUE) {508kfpu_begin();509}510ops->init_byteswap(&ctx);511ops->compute_byteswap(&ctx, buf, size);512ops->fini_byteswap(&ctx, zcp);513if (ops->uses_fpu == B_TRUE) {514kfpu_end();515}516}517518void519fletcher_4_byteswap(const void *buf, uint64_t size,520const void *ctx_template, zio_cksum_t *zcp)521{522(void) ctx_template;523const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,524uint64_t);525526ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));527528if (size == 0 || p2size == 0) {529ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);530531if (size > 0)532fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,533buf, size);534} else {535fletcher_4_byteswap_impl(buf, p2size, zcp);536537if (p2size < size)538fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,539(char *)buf + p2size, size - p2size);540}541}542543/* Incremental Fletcher 4 */544545#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)546547static inline void548fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,549const zio_cksum_t *nzcp)550{551const uint64_t c1 = size / sizeof (uint32_t);552const uint64_t c2 = c1 * (c1 + 1) / 2;553const uint64_t c3 = c2 * (c1 + 2) / 3;554555/*556* Value of 'c3' overflows on buffer sizes close to 16MiB. For that557* reason we split incremental fletcher4 computation of large buffers558* to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.559*/560ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);561562zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +563c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];564zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +565c2 * zcp->zc_word[0];566zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];567zcp->zc_word[0] += nzcp->zc_word[0];568}569570static inline void571fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,572zio_cksum_t *zcp)573{574while (size > 0) {575zio_cksum_t nzc;576uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);577578if (native)579fletcher_4_native(buf, len, NULL, &nzc);580else581fletcher_4_byteswap(buf, len, NULL, &nzc);582583fletcher_4_incremental_combine(zcp, len, &nzc);584585size -= len;586buf += len;587}588}589590int591fletcher_4_incremental_native(void *buf, size_t size, void *data)592{593zio_cksum_t *zcp = data;594/* Use scalar impl to directly update cksum of small blocks */595if (size < SPA_MINBLOCKSIZE)596fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);597else598fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);599return (0);600}601602int603fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)604{605zio_cksum_t *zcp = data;606/* Use scalar impl to directly update cksum of small blocks */607if (size < SPA_MINBLOCKSIZE)608fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);609else610fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);611return (0);612}613614#if defined(_KERNEL)615/*616* Fletcher 4 kstats617*/618static int619fletcher_4_kstat_headers(char *buf, size_t size)620{621ssize_t off = 0;622623off += snprintf(buf + off, size, "%-17s", "implementation");624off += snprintf(buf + off, size - off, "%-15s", "native");625(void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");626627return (0);628}629630static int631fletcher_4_kstat_data(char *buf, size_t size, void *data)632{633struct fletcher_4_kstat *fastest_stat =634&fletcher_4_stat_data[fletcher_4_supp_impls_cnt];635struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;636ssize_t off = 0;637638if (curr_stat == fastest_stat) {639off += snprintf(buf + off, size - off, "%-17s", "fastest");640off += snprintf(buf + off, size - off, "%-15s",641fletcher_4_supp_impls[fastest_stat->native]->name);642(void) snprintf(buf + off, size - off, "%-15s\n",643fletcher_4_supp_impls[fastest_stat->byteswap]->name);644} else {645ptrdiff_t id = curr_stat - fletcher_4_stat_data;646647off += snprintf(buf + off, size - off, "%-17s",648fletcher_4_supp_impls[id]->name);649off += snprintf(buf + off, size - off, "%-15llu",650(u_longlong_t)curr_stat->native);651(void) snprintf(buf + off, size - off, "%-15llu\n",652(u_longlong_t)curr_stat->byteswap);653}654655return (0);656}657658static void *659fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)660{661if (n <= fletcher_4_supp_impls_cnt)662ksp->ks_private = (void *) (fletcher_4_stat_data + n);663else664ksp->ks_private = NULL;665666return (ksp->ks_private);667}668#endif669670#define FLETCHER_4_FASTEST_FN_COPY(type, src) \671{ \672fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \673fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \674fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \675fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \676}677678#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */679680typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,681zio_cksum_t *);682683#if defined(_KERNEL)684static void685fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)686{687688struct fletcher_4_kstat *fastest_stat =689&fletcher_4_stat_data[fletcher_4_supp_impls_cnt];690hrtime_t start;691uint64_t run_bw, run_time_ns, best_run = 0;692zio_cksum_t zc;693uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);694695fletcher_checksum_func_t *fletcher_4_test = native ?696fletcher_4_native : fletcher_4_byteswap;697698for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {699struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];700uint64_t run_count = 0;701702/* temporary set an implementation */703fletcher_4_impl_chosen = i;704705kpreempt_disable();706start = gethrtime();707do {708for (l = 0; l < 32; l++, run_count++)709fletcher_4_test(data, data_size, NULL, &zc);710711run_time_ns = gethrtime() - start;712} while (run_time_ns < FLETCHER_4_BENCH_NS);713kpreempt_enable();714715run_bw = data_size * run_count * NANOSEC;716run_bw /= run_time_ns; /* B/s */717718if (native)719stat->native = run_bw;720else721stat->byteswap = run_bw;722723if (run_bw > best_run) {724best_run = run_bw;725726if (native) {727fastest_stat->native = i;728FLETCHER_4_FASTEST_FN_COPY(native,729fletcher_4_supp_impls[i]);730} else {731fastest_stat->byteswap = i;732FLETCHER_4_FASTEST_FN_COPY(byteswap,733fletcher_4_supp_impls[i]);734}735}736}737738/* restore original selection */739atomic_swap_32(&fletcher_4_impl_chosen, sel_save);740}741#endif /* _KERNEL */742743/*744* Initialize and benchmark all supported implementations.745*/746static void747fletcher_4_benchmark(void)748{749fletcher_4_ops_t *curr_impl;750int i, c;751752/* Move supported implementations into fletcher_4_supp_impls */753for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {754curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];755756if (curr_impl->valid && curr_impl->valid())757fletcher_4_supp_impls[c++] = curr_impl;758}759membar_producer(); /* complete fletcher_4_supp_impls[] init */760fletcher_4_supp_impls_cnt = c; /* number of supported impl */761762#if defined(_KERNEL)763static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */764char *databuf = vmem_alloc(data_size, KM_SLEEP);765766for (i = 0; i < data_size / sizeof (uint64_t); i++)767((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */768769fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);770fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);771772vmem_free(databuf, data_size);773#else774/*775* Skip the benchmark in user space to avoid impacting libzpool776* consumers (zdb, zhack, zinject, ztest). The last implementation777* is assumed to be the fastest and used by default.778*/779memcpy(&fletcher_4_fastest_impl,780fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],781sizeof (fletcher_4_fastest_impl));782fletcher_4_fastest_impl.name = "fastest";783membar_producer();784#endif /* _KERNEL */785}786787void788fletcher_4_init(void)789{790/* Determine the fastest available implementation. */791fletcher_4_benchmark();792793#if defined(_KERNEL)794/* Install kstats for all implementations */795fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",796KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);797if (fletcher_4_kstat != NULL) {798fletcher_4_kstat->ks_data = NULL;799fletcher_4_kstat->ks_ndata = UINT32_MAX;800kstat_set_raw_ops(fletcher_4_kstat,801fletcher_4_kstat_headers,802fletcher_4_kstat_data,803fletcher_4_kstat_addr);804kstat_install(fletcher_4_kstat);805}806#endif807808/* Finish initialization */809fletcher_4_initialized = B_TRUE;810}811812void813fletcher_4_fini(void)814{815#if defined(_KERNEL)816if (fletcher_4_kstat != NULL) {817kstat_delete(fletcher_4_kstat);818fletcher_4_kstat = NULL;819}820#endif821}822823/* ABD adapters */824825static void826abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)827{828const fletcher_4_ops_t *ops = fletcher_4_impl_get();829cdp->acd_private = (void *) ops;830831if (ops->uses_fpu == B_TRUE) {832kfpu_begin();833}834if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)835ops->init_native(cdp->acd_ctx);836else837ops->init_byteswap(cdp->acd_ctx);838839}840841static void842abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)843{844fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;845846ASSERT(ops);847848if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)849ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);850else851ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);852853if (ops->uses_fpu == B_TRUE) {854kfpu_end();855}856}857858859static void860abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,861zio_abd_checksum_data_t *cdp)862{863zio_cksum_t *zcp = cdp->acd_zcp;864865ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);866867abd_fletcher_4_fini(cdp);868cdp->acd_private = (void *)&fletcher_4_scalar_ops;869870if (native)871fletcher_4_incremental_native(data, size, zcp);872else873fletcher_4_incremental_byteswap(data, size, zcp);874}875876static int877abd_fletcher_4_iter(void *data, size_t size, void *private)878{879zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;880fletcher_4_ctx_t *ctx = cdp->acd_ctx;881fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;882boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;883uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t);884885ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));886887if (asize > 0) {888if (native)889ops->compute_native(ctx, data, asize);890else891ops->compute_byteswap(ctx, data, asize);892893size -= asize;894data = (char *)data + asize;895}896897if (size > 0) {898ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);899/* At this point we have to switch to scalar impl */900abd_fletcher_4_simd2scalar(native, data, size, cdp);901}902903return (0);904}905906zio_abd_checksum_func_t fletcher_4_abd_ops = {907.acf_init = abd_fletcher_4_init,908.acf_fini = abd_fletcher_4_fini,909.acf_iter = abd_fletcher_4_iter910};911912#if defined(_KERNEL)913914#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")915916#if defined(__linux__)917918static int919fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)920{921const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);922char *fmt;923int cnt = 0;924925/* list fastest */926fmt = IMPL_FMT(impl, IMPL_FASTEST);927cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");928929/* list all supported implementations */930for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {931fmt = IMPL_FMT(impl, i);932cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,933fletcher_4_supp_impls[i]->name);934}935936return (cnt);937}938939static int940fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)941{942return (fletcher_4_impl_set(val));943}944945#else946947#include <sys/sbuf.h>948949static int950fletcher_4_param(ZFS_MODULE_PARAM_ARGS)951{952int err;953954if (req->newptr == NULL) {955const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);956const int init_buflen = 64;957const char *fmt;958struct sbuf *s;959960s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);961962/* list fastest */963fmt = IMPL_FMT(impl, IMPL_FASTEST);964(void) sbuf_printf(s, fmt, "fastest");965966/* list all supported implementations */967for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {968fmt = IMPL_FMT(impl, i);969(void) sbuf_printf(s, fmt,970fletcher_4_supp_impls[i]->name);971}972973err = sbuf_finish(s);974sbuf_delete(s);975976return (err);977}978979char buf[16];980981err = sysctl_handle_string(oidp, buf, sizeof (buf), req);982if (err)983return (err);984return (-fletcher_4_impl_set(buf));985}986987#endif988989#undef IMPL_FMT990991/*992* Choose a fletcher 4 implementation in ZFS.993* Users can choose "cycle" to exercise all implementations, but this is994* for testing purpose therefore it can only be set in user space.995*/996ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,997fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,998"Select fletcher 4 implementation.");9991000EXPORT_SYMBOL(fletcher_init);1001EXPORT_SYMBOL(fletcher_2_incremental_native);1002EXPORT_SYMBOL(fletcher_2_incremental_byteswap);1003EXPORT_SYMBOL(fletcher_4_init);1004EXPORT_SYMBOL(fletcher_4_fini);1005EXPORT_SYMBOL(fletcher_2_native);1006EXPORT_SYMBOL(fletcher_2_byteswap);1007EXPORT_SYMBOL(fletcher_4_native);1008EXPORT_SYMBOL(fletcher_4_native_varsize);1009EXPORT_SYMBOL(fletcher_4_byteswap);1010EXPORT_SYMBOL(fletcher_4_incremental_native);1011EXPORT_SYMBOL(fletcher_4_incremental_byteswap);1012EXPORT_SYMBOL(fletcher_4_abd_ops);1013#endif101410151016