GitHub Repository: torvalds/linux
Path: blob/master/lib/crc/riscv/crc-clmul-template.h
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
 * file must define the following parameters to specify the type of CRC:
 *
 *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *		   mapping between bits and polynomial coefficients
 *		 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *		   mapping between bits and polynomial coefficients
 */

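/*
 * Example for illustration only: an includer might define the parameters and
 * wrap crc_clmul() roughly as sketched below.  The function name and the
 * choice of a 32-bit lsb-first CRC here are assumptions, not taken from this
 * file.
 *
 *	#define crc_t	u32
 *	#define LSB_CRC	1	// lsb-first (reflected) mapping
 *	#include "crc-clmul-template.h"
 *
 *	u32 crc32_example_clmul(u32 crc, const void *p, size_t len,
 *				const struct crc_clmul_consts *consts)
 *	{
 *		return crc_clmul(crc, p, len, consts);
 *	}
 */
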
#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */

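/*
 * clmul(), clmulh(), and clmulr() wrap the Zbc carryless-multiply
 * instructions.  Viewing the XLEN-bit operands a and b as polynomials over
 * GF(2), their full carryless product is a polynomial of degree at most
 * 2*XLEN - 2.  clmul returns the low XLEN bits of that product, clmulh
 * returns the high XLEN bits, and clmulr returns bits 2*XLEN-2 .. XLEN-1.
 * For example, clmul(0x5, 0x3) = 0xf, since (x^2 + 1) * (x + 1) =
 * x^3 + x^2 + x + 1.
 */
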
static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

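/*
 * Minimal sketch, for illustration only: the fold loop in crc_clmul() below
 * uses clmulh() and clmul() as a pair to form the full double-width carryless
 * product of two longs.  The helper name clmul_wide is hypothetical and is not
 * used elsewhere in this file.
 */
static inline void clmul_wide(unsigned long a, unsigned long b,
			      unsigned long *hi, unsigned long *lo)
{
	*hi = clmulh(a, b);	/* coefficients of x^BITS_PER_LONG and above */
	*lo = clmul(a, b);	/* coefficients below x^BITS_PER_LONG */
}
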
/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
#  if LSB_CRC
#    define crc_load_long(x)	le64_to_cpup(x)
#  else
#    define crc_load_long(x)	be64_to_cpup(x)
#  endif
#else
#  if LSB_CRC
#    define crc_load_long(x)	le32_to_cpup(x)
#  else
#    define crc_load_long(x)	be32_to_cpup(x)
#  endif
#endif

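/*
 * In both cases, the byte that comes first in memory ends up at the end of
 * the long that holds the highest-order polynomial terms: the
 * least-significant byte for an lsb-first CRC and the most-significant byte
 * for an msb-first CRC.
 */
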
/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	return msgpoly ^ crc;
#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}

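/*
 * For example, with a 32-bit msb-first CRC on a 64-bit machine, the CRC is
 * placed in bits 63..32 of @msgpoly; with an lsb-first CRC it is placed in
 * the low 32 bits, which hold the highest-order terms in the reflected
 * mapping.
 */
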
/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher terms
	 * of msgpoly * x^n.  Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly.  Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication.  In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces.  In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case further modify it to the following which avoids
	 * a shift, as the crc ends up in the physically low n bits from clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}

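/*
 * Illustrative reference, not used anywhere in this file: for the msb-first
 * case, crc_clmul_long() computes the same value as the bit-at-a-time loop
 * below, assuming a hypothetical parameter @g that holds the low n
 * coefficients of the generator polynomial G (the real code encodes G in the
 * precomputed @consts instead).
 */
static inline crc_t
crc_ref_long_msb(unsigned long msgpoly, crc_t g)
{
	crc_t crc = 0;
	int i;

	/* Process the message bits from the highest-order term downward. */
	for (i = BITS_PER_LONG - 1; i >= 0; i--) {
		int bit = (crc >> (CRC_BITS - 1)) ^ ((msgpoly >> i) & 1);

		crc = (crc << 1) ^ (bit ? g : 0);
	}
	return crc;
}
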
/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
		      const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

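/*
 * Note on the two cases above: when @len is at least sizeof(crc_t) bytes, the
 * entire CRC overlaps the appended data, so all of it is XORed into @msgpoly
 * and reduced in one step.  When @len is shorter, only the high-order part of
 * the CRC overlaps the data; the remaining low-order part stays below x^n
 * after the shift by 8*@len bit positions, needs no reduction, and is simply
 * XORed into the result.
 */
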
static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that uses
		 * just m0 and m1 again.  This is done by multiplying m0 by the
		 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
		 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
		 * adding the results to m2 and m3 as appropriate.  Each such
		 * multiplication produces a result twice the length of a long,
		 * which in RISC-V is two instructions clmul and clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}