Path: blob/main/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c
48383 views
// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only1/*2* Implement fast Fletcher4 using superscalar pipelines.3*4* Use regular C code to compute5* Fletcher4 in four incremental 64-bit parallel accumulator streams,6* and then combine the streams to form the final four checksum words.7* This implementation is a derivative of the AVX SIMD implementation by8* James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).9*10* Copyright (C) 2016 Romain Dolbeau.11*12* Authors:13* Romain Dolbeau <[email protected]>14*15* This software is available to you under a choice of one of two16* licenses. You may choose to be licensed under the terms of the GNU17* General Public License (GPL) Version 2, available from the file18* COPYING in the main directory of this source tree, or the19* OpenIB.org BSD license below:20*21* Redistribution and use in source and binary forms, with or22* without modification, are permitted provided that the following23* conditions are met:24*25* - Redistributions of source code must retain the above26* copyright notice, this list of conditions and the following27* disclaimer.28*29* - Redistributions in binary form must reproduce the above30* copyright notice, this list of conditions and the following31* disclaimer in the documentation and/or other materials32* provided with the distribution.33*34* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,35* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF36* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND37* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS38* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN39* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN40* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE41* SOFTWARE.42*/4344#include <sys/param.h>45#include <sys/byteorder.h>46#include <sys/spa_checksum.h>47#include <sys/string.h>48#include <zfs_fletcher.h>4950static void51fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx)52{53memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t));54}5556static void57fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)58{59uint64_t A, B, C, D;6061A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] +62ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3];63B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] -643 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +654 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] +664 * ctx->superscalar[1].v[3];6768C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] -696 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] -7014 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] +7116 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] +7216 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3];7374D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +7510 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] +7634 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] -7764 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] -7896 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] +7964 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] +8064 * ctx->superscalar[3].v[3];8182ZIO_SET_CHECKSUM(zcp, A, B, C, D);83}8485static void86fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx,87const void *buf, uint64_t size)88{89const uint32_t *ip = buf;90const uint32_t *ipend = ip + (size / sizeof (uint32_t));91uint64_t a, b, c, d;92uint64_t a2, b2, c2, d2;93uint64_t a3, b3, c3, d3;94uint64_t a4, b4, c4, d4;9596a = ctx->superscalar[0].v[0];97b = ctx->superscalar[1].v[0];98c = ctx->superscalar[2].v[0];99d = ctx->superscalar[3].v[0];100a2 = ctx->superscalar[0].v[1];101b2 = ctx->superscalar[1].v[1];102c2 = ctx->superscalar[2].v[1];103d2 = ctx->superscalar[3].v[1];104a3 = ctx->superscalar[0].v[2];105b3 = ctx->superscalar[1].v[2];106c3 = ctx->superscalar[2].v[2];107d3 = ctx->superscalar[3].v[2];108a4 = ctx->superscalar[0].v[3];109b4 = ctx->superscalar[1].v[3];110c4 = ctx->superscalar[2].v[3];111d4 = ctx->superscalar[3].v[3];112113do {114a += ip[0];115a2 += ip[1];116a3 += ip[2];117a4 += ip[3];118b += a;119b2 += a2;120b3 += a3;121b4 += a4;122c += b;123c2 += b2;124c3 += b3;125c4 += b4;126d += c;127d2 += c2;128d3 += c3;129d4 += c4;130} while ((ip += 4) < ipend);131132ctx->superscalar[0].v[0] = a;133ctx->superscalar[1].v[0] = b;134ctx->superscalar[2].v[0] = c;135ctx->superscalar[3].v[0] = d;136ctx->superscalar[0].v[1] = a2;137ctx->superscalar[1].v[1] = b2;138ctx->superscalar[2].v[1] = c2;139ctx->superscalar[3].v[1] = d2;140ctx->superscalar[0].v[2] = a3;141ctx->superscalar[1].v[2] = b3;142ctx->superscalar[2].v[2] = c3;143ctx->superscalar[3].v[2] = d3;144ctx->superscalar[0].v[3] = a4;145ctx->superscalar[1].v[3] = b4;146ctx->superscalar[2].v[3] = c4;147ctx->superscalar[3].v[3] = d4;148}149150static void151fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx,152const void *buf, uint64_t size)153{154const uint32_t *ip = buf;155const uint32_t *ipend = ip + (size / sizeof (uint32_t));156uint64_t a, b, c, d;157uint64_t a2, b2, c2, d2;158uint64_t a3, b3, c3, d3;159uint64_t a4, b4, c4, d4;160161a = ctx->superscalar[0].v[0];162b = ctx->superscalar[1].v[0];163c = ctx->superscalar[2].v[0];164d = ctx->superscalar[3].v[0];165a2 = ctx->superscalar[0].v[1];166b2 = ctx->superscalar[1].v[1];167c2 = ctx->superscalar[2].v[1];168d2 = ctx->superscalar[3].v[1];169a3 = ctx->superscalar[0].v[2];170b3 = ctx->superscalar[1].v[2];171c3 = ctx->superscalar[2].v[2];172d3 = ctx->superscalar[3].v[2];173a4 = ctx->superscalar[0].v[3];174b4 = ctx->superscalar[1].v[3];175c4 = ctx->superscalar[2].v[3];176d4 = ctx->superscalar[3].v[3];177178do {179a += BSWAP_32(ip[0]);180a2 += BSWAP_32(ip[1]);181a3 += BSWAP_32(ip[2]);182a4 += BSWAP_32(ip[3]);183b += a;184b2 += a2;185b3 += a3;186b4 += a4;187c += b;188c2 += b2;189c3 += b3;190c4 += b4;191d += c;192d2 += c2;193d3 += c3;194d4 += c4;195} while ((ip += 4) < ipend);196197ctx->superscalar[0].v[0] = a;198ctx->superscalar[1].v[0] = b;199ctx->superscalar[2].v[0] = c;200ctx->superscalar[3].v[0] = d;201ctx->superscalar[0].v[1] = a2;202ctx->superscalar[1].v[1] = b2;203ctx->superscalar[2].v[1] = c2;204ctx->superscalar[3].v[1] = d2;205ctx->superscalar[0].v[2] = a3;206ctx->superscalar[1].v[2] = b3;207ctx->superscalar[2].v[2] = c3;208ctx->superscalar[3].v[2] = d3;209ctx->superscalar[0].v[3] = a4;210ctx->superscalar[1].v[3] = b4;211ctx->superscalar[2].v[3] = c4;212ctx->superscalar[3].v[3] = d4;213}214215static boolean_t fletcher_4_superscalar4_valid(void)216{217return (B_TRUE);218}219220const fletcher_4_ops_t fletcher_4_superscalar4_ops = {221.init_native = fletcher_4_superscalar4_init,222.compute_native = fletcher_4_superscalar4_native,223.fini_native = fletcher_4_superscalar4_fini,224.init_byteswap = fletcher_4_superscalar4_init,225.compute_byteswap = fletcher_4_superscalar4_byteswap,226.fini_byteswap = fletcher_4_superscalar4_fini,227.valid = fletcher_4_superscalar4_valid,228.uses_fpu = B_FALSE,229.name = "superscalar4"230};231232233