Path: blob/main/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
48383 views
// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
/*
 * Implement fast Fletcher4 with NEON instructions. (aarch64)
 *
 * Use the 128-bit NEON SIMD instructions and registers to compute
 * Fletcher4 in two incremental 64-bit parallel accumulator streams,
 * and then combine the streams to form the final four checksum words.
 * This implementation is a derivative of the AVX SIMD implementation by
 * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
 *
 * Copyright (C) 2016 Romain Dolbeau.
 *
 * Authors:
 *	Romain Dolbeau <[email protected]>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if defined(__aarch64__)

#include <sys/simd.h>
#include <sys/spa_checksum.h>
#include <sys/string.h>
#include <zfs_fletcher.h>

/*
 * Reset the NEON state kept in the context: all four accumulator slots
 * of ctx->aarch64_neon are cleared to zero.
 */
static void
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
{
	memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
}

/*
 * Combine the two 64-bit lanes (v[0] and v[1]) of each of the four
 * accumulators stored in ctx->aarch64_neon into the four checksum words
 * A, B, C and D, and store them in *zcp with ZIO_SET_CHECKSUM().
 *
 * The weights below recombine the two interleaved streams; they must not
 * be reordered or simplified independently of the main loop.
 */
static void
fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	uint64_t A, B, C, D;

	A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1];
	B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] -
	    ctx->aarch64_neon[0].v[1];
	C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] +
	    4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1];
	D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] +
	    8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
	    ctx->aarch64_neon[1].v[1];
	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}

/*
 * Prologue of a compute pass: clear ZERO (eor of a register with itself)
 * and load ACC0..ACC3 from the four accumulator slots of the context.
 * Expects `ctx`, ZERO and ACC0..ACC3 to be in scope at the expansion site.
 */
#define	NEON_INIT_LOOP() \
	asm("eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n" \
	"ld1 { %[ACC0].4s }, %[CTX0]\n" \
	"ld1 { %[ACC1].4s }, %[CTX1]\n" \
	"ld1 { %[ACC2].4s }, %[CTX2]\n" \
	"ld1 { %[ACC3].4s }, %[CTX3]\n" \
	: [ZERO] "=w" (ZERO), \
	[ACC0] "=w" (ACC0), [ACC1] "=w" (ACC1), \
	[ACC2] "=w" (ACC2), [ACC3] "=w" (ACC3) \
	: [CTX0] "Q" (ctx->aarch64_neon[0]), \
	[CTX1] "Q" (ctx->aarch64_neon[1]), \
	[CTX2] "Q" (ctx->aarch64_neon[2]), \
	[CTX3] "Q" (ctx->aarch64_neon[3]))

/* Byte-reverse every 32-bit word of SRC (used by the byteswap variant). */
#define	NEON_DO_REVERSE	"rev32 %[SRC].16b, %[SRC].16b\n"

/* Native byte order: no extra instruction is inserted. */
#define	NEON_DONT_REVERSE	""

/*
 * One step of the checksum: load 16 bytes (four 32-bit words) from *ip
 * into SRC, optionally apply REVERSE, then interleave SRC with ZERO so
 * that TMP1 holds the low two words and TMP2 the high two words, each in
 * its own 64-bit lane.  The accumulator chain
 * ACC0 += TMPn; ACC1 += ACC0; ACC2 += ACC1; ACC3 += ACC2
 * is then applied first with TMP1 and again with TMP2.
 *
 * SRC, TMP1 and TMP2 are early-clobber outputs; the accumulators are
 * read-write.  Expects `ip` and all vector variables to be in scope.
 */
#define	NEON_MAIN_LOOP(REVERSE) \
	asm("ld1 { %[SRC].4s }, %[IP]\n" \
	REVERSE \
	"zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n" \
	"zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n" \
	"add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n" \
	"add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
	"add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
	"add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
	"add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n" \
	"add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
	"add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
	"add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
	: [SRC] "=&w" (SRC), \
	[TMP1] "=&w" (TMP1), [TMP2] "=&w" (TMP2), \
	[ACC0] "+w" (ACC0), [ACC1] "+w" (ACC1), \
	[ACC2] "+w" (ACC2), [ACC3] "+w" (ACC3) \
	: [ZERO] "w" (ZERO), [IP] "Q" (*ip))

/*
 * Epilogue of a compute pass: store ACC0..ACC3 back into the four
 * accumulator slots of the context so a later pass or the fini routine
 * can pick them up.
 */
#define	NEON_FINI_LOOP() \
	asm("st1 { %[ACC0].4s },%[DST0]\n" \
	"st1 { %[ACC1].4s },%[DST1]\n" \
	"st1 { %[ACC2].4s },%[DST2]\n" \
	"st1 { %[ACC3].4s },%[DST3]\n" \
	: [DST0] "=Q" (ctx->aarch64_neon[0]), \
	[DST1] "=Q" (ctx->aarch64_neon[1]), \
	[DST2] "=Q" (ctx->aarch64_neon[2]), \
	[DST3] "=Q" (ctx->aarch64_neon[3]) \
	: [ACC0] "w" (ACC0), [ACC1] "w" (ACC1), \
	[ACC2] "w" (ACC2), [ACC3] "w" (ACC3))

/*
 * Accumulate `size` bytes starting at `buf` into the NEON state held in
 * `ctx`, reading the data in native byte order.
 *
 * The loop consumes 16 bytes per iteration and tests the bound only
 * after the body has run, so at least 16 bytes are always read.
 * NOTE(review): callers are presumably required to pass a non-zero
 * multiple of 16 bytes; confirm against the Fletcher-4 dispatcher.
 */
static void
fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size)
{
	const uint64_t *ip = buf;
	/* Keep the const qualifier while computing the end pointer. */
	const uint64_t *ipend = (const uint64_t *)((const uint8_t *)ip + size);
#if defined(_KERNEL)
	/* Kernel builds pin each vector variable to a fixed register v0-v7. */
register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
#else
unsigned char ZERO __attribute__((vector_size(16)));
unsigned char ACC0 __attribute__((vector_size(16)));
unsigned char ACC1 __attribute__((vector_size(16)));
unsigned char ACC2 __attribute__((vector_size(16)));
unsigned char ACC3 __attribute__((vector_size(16)));
unsigned char TMP1 __attribute__((vector_size(16)));
unsigned char TMP2 __attribute__((vector_size(16)));
unsigned char SRC __attribute__((vector_size(16)));
#endif

	NEON_INIT_LOOP();

	/* ip advances by two uint64_t, i.e. the 16 bytes loaded per step. */
	do {
		NEON_MAIN_LOOP(NEON_DONT_REVERSE);
	} while ((ip += 2) < ipend);

	NEON_FINI_LOOP();
}

/*
 * Same as fletcher_4_aarch64_neon_native(), except that every 32-bit
 * input word is byte-reversed (NEON_DO_REVERSE) before it is accumulated.
 *
 * The loop consumes 16 bytes per iteration and tests the bound only
 * after the body has run, so at least 16 bytes are always read.
 * NOTE(review): callers are presumably required to pass a non-zero
 * multiple of 16 bytes; confirm against the Fletcher-4 dispatcher.
 */
static void
fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size)
{
	const uint64_t *ip = buf;
	/* Keep the const qualifier while computing the end pointer. */
	const uint64_t *ipend = (const uint64_t *)((const uint8_t *)ip + size);
#if defined(_KERNEL)
	/* Kernel builds pin each vector variable to a fixed register v0-v7. */
register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
#else
unsigned char ZERO __attribute__((vector_size(16)));
unsigned char ACC0 __attribute__((vector_size(16)));
unsigned char ACC1 __attribute__((vector_size(16)));
unsigned char ACC2 __attribute__((vector_size(16)));
unsigned char ACC3 __attribute__((vector_size(16)));
unsigned char TMP1 __attribute__((vector_size(16)));
unsigned char TMP2 __attribute__((vector_size(16)));
unsigned char SRC __attribute__((vector_size(16)));
#endif

	NEON_INIT_LOOP();

	/* ip advances by two uint64_t, i.e. the 16 bytes loaded per step. */
	do {
		NEON_MAIN_LOOP(NEON_DO_REVERSE);
	} while ((ip += 2) < ipend);

	NEON_FINI_LOOP();
}

/*
 * Report whether this implementation may be used; the decision is
 * delegated entirely to kfpu_allowed().
 */
static boolean_t
fletcher_4_aarch64_neon_valid(void)
{
	return (kfpu_allowed());
}

/*
 * Operations table exported for this implementation.  The native and
 * byteswap variants share the init and fini routines and differ only in
 * the compute routine.
 */
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
	.init_native = fletcher_4_aarch64_neon_init,
	.compute_native = fletcher_4_aarch64_neon_native,
	.fini_native = fletcher_4_aarch64_neon_fini,
	.init_byteswap = fletcher_4_aarch64_neon_init,
	.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
	.fini_byteswap = fletcher_4_aarch64_neon_fini,
	.valid = fletcher_4_aarch64_neon_valid,
	.uses_fpu = B_TRUE,
	.name = "aarch64_neon"
};

#endif /* defined(__aarch64__) */