Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
48383 views
1
// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only
2
/*
3
* Implement fast Fletcher4 with NEON instructions. (aarch64)
4
*
5
* Use the 128-bit NEON SIMD instructions and registers to compute
6
* Fletcher4 in two incremental 64-bit parallel accumulator streams,
7
* and then combine the streams to form the final four checksum words.
8
* This implementation is a derivative of the AVX SIMD implementation by
9
* James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
10
*
11
* Copyright (C) 2016 Romain Dolbeau.
12
*
13
* Authors:
14
* Romain Dolbeau <[email protected]>
15
*
16
* This software is available to you under a choice of one of two
17
* licenses. You may choose to be licensed under the terms of the GNU
18
* General Public License (GPL) Version 2, available from the file
19
* COPYING in the main directory of this source tree, or the
20
* OpenIB.org BSD license below:
21
*
22
* Redistribution and use in source and binary forms, with or
23
* without modification, are permitted provided that the following
24
* conditions are met:
25
*
26
* - Redistributions of source code must retain the above
27
* copyright notice, this list of conditions and the following
28
* disclaimer.
29
*
30
* - Redistributions in binary form must reproduce the above
31
* copyright notice, this list of conditions and the following
32
* disclaimer in the documentation and/or other materials
33
* provided with the distribution.
34
*
35
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42
* SOFTWARE.
43
*/
44
45
#if defined(__aarch64__)
46
47
#include <sys/simd.h>
48
#include <sys/spa_checksum.h>
49
#include <sys/string.h>
50
#include <zfs_fletcher.h>
51
52
static void
53
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
54
{
55
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
56
}
57
58
static void
59
fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
60
{
61
uint64_t A, B, C, D;
62
A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1];
63
B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] -
64
ctx->aarch64_neon[0].v[1];
65
C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] +
66
4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1];
67
D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] +
68
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
69
ctx->aarch64_neon[1].v[1];
70
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
71
}
72
73
/*
 * Prologue for the compute routines: clear the ZERO vector (eor of the
 * register with itself) and load the four saved accumulators from
 * ctx->aarch64_neon[0..3] into ACC0..ACC3.
 *
 * Expands in a scope that declares `ctx` and the vector variables ZERO
 * and ACC0..ACC3.
 */
#define	NEON_INIT_LOOP()						\
	asm(								\
	    "eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n"			\
	    "ld1 { %[ACC0].4s }, %[CTX0]\n"				\
	    "ld1 { %[ACC1].4s }, %[CTX1]\n"				\
	    "ld1 { %[ACC2].4s }, %[CTX2]\n"				\
	    "ld1 { %[ACC3].4s }, %[CTX3]\n"				\
	    /* outputs */						\
	    : [ZERO] "=w" (ZERO),					\
	    [ACC0] "=w" (ACC0),						\
	    [ACC1] "=w" (ACC1),						\
	    [ACC2] "=w" (ACC2),						\
	    [ACC3] "=w" (ACC3)						\
	    /* inputs: the four context slots as memory operands */	\
	    : [CTX0] "Q" (ctx->aarch64_neon[0]),			\
	    [CTX1] "Q" (ctx->aarch64_neon[1]),				\
	    [CTX2] "Q" (ctx->aarch64_neon[2]),				\
	    [CTX3] "Q" (ctx->aarch64_neon[3]))
86
87
/*
 * Optional instruction spliced into NEON_MAIN_LOOP right after the load.
 * NEON_DO_REVERSE byte-reverses every 32-bit element of SRC (rev32);
 * NEON_DONT_REVERSE contributes no instruction at all.
 */
#define	NEON_DO_REVERSE		"rev32 %[SRC].16b, %[SRC].16b\n"

#define	NEON_DONT_REVERSE	""
90
91
/*
 * One iteration of the checksum loop over the 16 bytes at *ip:
 *
 *  1. ld1 loads the chunk into SRC; REVERSE (NEON_DO_REVERSE or
 *     NEON_DONT_REVERSE) optionally byte-swaps its 32-bit elements.
 *  2. zip1/zip2 interleave SRC's 32-bit elements with ZERO, so TMP1
 *     receives the lower two input words and TMP2 the upper two, each
 *     paired with a zero element to fill a 64-bit lane.
 *  3. TMP1 is added into ACC0 and the sum is cascaded through
 *     ACC1 <- ACC0, ACC2 <- ACC1, ACC3 <- ACC2; the same cascade is then
 *     repeated with TMP2.
 *
 * Expands in a scope that declares `ip` and the vector variables SRC,
 * TMP1, TMP2, ZERO and ACC0..ACC3.  The caller advances `ip`.
 */
#define	NEON_MAIN_LOOP(REVERSE)						\
	asm(								\
	    "ld1 { %[SRC].4s }, %[IP]\n"				\
	    REVERSE							\
	    "zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n"			\
	    "zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n"			\
	    /* first pair of input words */				\
	    "add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n"			\
	    "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n"			\
	    "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n"			\
	    "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n"			\
	    /* second pair of input words */				\
	    "add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n"			\
	    "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n"			\
	    "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n"			\
	    "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n"			\
	    /* scratch registers are early-clobber outputs */		\
	    : [SRC] "=&w" (SRC),					\
	    [TMP1] "=&w" (TMP1),					\
	    [TMP2] "=&w" (TMP2),					\
	    /* accumulators are read and written */			\
	    [ACC0] "+w" (ACC0),						\
	    [ACC1] "+w" (ACC1),						\
	    [ACC2] "+w" (ACC2),						\
	    [ACC3] "+w" (ACC3)						\
	    : [ZERO] "w" (ZERO),					\
	    [IP] "Q" (*ip))
109
110
/*
 * Epilogue for the compute routines: store ACC0..ACC3 back into
 * ctx->aarch64_neon[0..3] so the running state survives between calls.
 *
 * Expands in a scope that declares `ctx` and ACC0..ACC3.
 */
#define	NEON_FINI_LOOP()						\
	asm(								\
	    "st1 { %[ACC0].4s },%[DST0]\n"				\
	    "st1 { %[ACC1].4s },%[DST1]\n"				\
	    "st1 { %[ACC2].4s },%[DST2]\n"				\
	    "st1 { %[ACC3].4s },%[DST3]\n"				\
	    /* outputs: the four context slots as memory operands */	\
	    : [DST0] "=Q" (ctx->aarch64_neon[0]),			\
	    [DST1] "=Q" (ctx->aarch64_neon[1]),				\
	    [DST2] "=Q" (ctx->aarch64_neon[2]),				\
	    [DST3] "=Q" (ctx->aarch64_neon[3])				\
	    /* inputs */						\
	    : [ACC0] "w" (ACC0),					\
	    [ACC1] "w" (ACC1),						\
	    [ACC2] "w" (ACC2),						\
	    [ACC3] "w" (ACC3))
121
122
static void
123
fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx,
124
const void *buf, uint64_t size)
125
{
126
const uint64_t *ip = buf;
127
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
128
#if defined(_KERNEL)
129
register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
130
register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
131
register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
132
register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
133
register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
134
register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
135
register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
136
register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
137
#else
138
unsigned char ZERO __attribute__((vector_size(16)));
139
unsigned char ACC0 __attribute__((vector_size(16)));
140
unsigned char ACC1 __attribute__((vector_size(16)));
141
unsigned char ACC2 __attribute__((vector_size(16)));
142
unsigned char ACC3 __attribute__((vector_size(16)));
143
unsigned char TMP1 __attribute__((vector_size(16)));
144
unsigned char TMP2 __attribute__((vector_size(16)));
145
unsigned char SRC __attribute__((vector_size(16)));
146
#endif
147
148
NEON_INIT_LOOP();
149
150
do {
151
NEON_MAIN_LOOP(NEON_DONT_REVERSE);
152
} while ((ip += 2) < ipend);
153
154
NEON_FINI_LOOP();
155
}
156
157
static void
158
fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
159
const void *buf, uint64_t size)
160
{
161
const uint64_t *ip = buf;
162
const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
163
#if defined(_KERNEL)
164
register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
165
register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
166
register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
167
register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
168
register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
169
register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
170
register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
171
register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
172
#else
173
unsigned char ZERO __attribute__((vector_size(16)));
174
unsigned char ACC0 __attribute__((vector_size(16)));
175
unsigned char ACC1 __attribute__((vector_size(16)));
176
unsigned char ACC2 __attribute__((vector_size(16)));
177
unsigned char ACC3 __attribute__((vector_size(16)));
178
unsigned char TMP1 __attribute__((vector_size(16)));
179
unsigned char TMP2 __attribute__((vector_size(16)));
180
unsigned char SRC __attribute__((vector_size(16)));
181
#endif
182
183
NEON_INIT_LOOP();
184
185
do {
186
NEON_MAIN_LOOP(NEON_DO_REVERSE);
187
} while ((ip += 2) < ipend);
188
189
NEON_FINI_LOOP();
190
}
191
192
/*
 * Availability check for this implementation: it reports whatever
 * kfpu_allowed() returns.
 */
static boolean_t
fletcher_4_aarch64_neon_valid(void)
{
	return (kfpu_allowed());
}
196
197
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
198
.init_native = fletcher_4_aarch64_neon_init,
199
.compute_native = fletcher_4_aarch64_neon_native,
200
.fini_native = fletcher_4_aarch64_neon_fini,
201
.init_byteswap = fletcher_4_aarch64_neon_init,
202
.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
203
.fini_byteswap = fletcher_4_aarch64_neon_fini,
204
.valid = fletcher_4_aarch64_neon_valid,
205
.uses_fpu = B_TRUE,
206
.name = "aarch64_neon"
207
};
208
209
#endif /* defined(__aarch64__) */
210
211