/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
.set OFFSETOF_SHUF_TABLE, 1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
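
// Note: the CRC functions below receive a pointer to the
// fold_across_128_bits_consts field of this table (see the prototype comment
// before _crc_pclmul), so the negative offsets above address the fields that
// come before it and the positive offsets the fields after it. The offsets
// also imply that shuf_table occupies the 48 bytes between OFFSETOF_SHUF_TABLE
// and OFFSETOF_BARRETT_REDUCTION_CONSTS; judging from how it is indexed below,
// it presumably holds 16 bytes of 0xff, the bytes 0-15, then 16 more bytes of
// 0xff.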

// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves. The supported
// instruction formats are:
//
//   - Two-arg [src, dst], where the non-VEX format is the same.
//   - Three-arg [src1, src2, dst] where the non-VEX format is
//     [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
        // VEX not allowed. Emulate it.
        .ifnb \arg3 // Three-arg [src1, src2, dst]
                .ifc "\arg2", "\arg3" // src2 == dst?
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                \insn \arg1, \arg3
                        .endif
                .else // src2 != dst
                        .ifc "\arg1", "\arg3"
                                .error "Can't have src1 == dst when src2 != dst"
                        .endif
                        .ifnb \unaligned_mem_tmp
                                movdqu \arg1, \unaligned_mem_tmp
                                movdqa \arg2, \arg3
                                \insn \unaligned_mem_tmp, \arg3
                        .else
                                movdqa \arg2, \arg3
                                \insn \arg1, \arg3
                        .endif
                .endif
        .else // Two-arg [src, dst]
                .ifnb \unaligned_mem_tmp
                        movdqu \arg1, \unaligned_mem_tmp
                        \insn \unaligned_mem_tmp, \arg2
                .else
                        \insn \arg1, \arg2
                .endif
        .endif
.else
        // VEX is allowed. Emit the desired instruction directly.
        .ifnb \arg3
                v\insn \arg1, \arg2, \arg3
        .else
                v\insn \arg1, \arg2
        .endif
.endif
.endm
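
// For example (purely illustrative), with AVX_LEVEL == 0 the invocation
//	_cond_vex pxor, %xmm1, %xmm2, %xmm3
// takes the three-arg path with src2 != dst and expands to
//	movdqa %xmm2, %xmm3
//	pxor %xmm1, %xmm3
// whereas with AVX_LEVEL != 0 it emits the single instruction
//	vpxor %xmm1, %xmm2, %xmm3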

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro _vbroadcast src, dst
.if VL == 16
        _cond_vex movdqa, \src, \dst
.elseif VL == 32
        vbroadcasti128 \src, \dst
.else
        vbroadcasti32x4 \src, \dst
.endif
.endm
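
// For instance, the invocation '_vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR),
// BSWAP_MASK' used below emits 'vbroadcasti128 OFFSETOF_BSWAP_MASK(CONSTS_PTR),
// %ymm6' when VL == 32, since BSWAP_MASK is then an alias for %ymm6.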

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro _load_data vl, src, bswap_mask, dst
.if \vl < 64
        _cond_vex movdqu, "\src", \dst
.else
        vmovdqu8 \src, \dst
.endif
.if !LSB_CRC
        _cond_vex pshufb, \bswap_mask, \dst, \dst
.endif
.endm

.macro _prepare_v0 vl, v0, v1, bswap_mask
.if LSB_CRC
        .if \vl < 64
                _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
        .else
                vpxorq (BUF), \v0, \v0
        .endif
.else
        _load_data \vl, (BUF), \bswap_mask, \v1
        .if \vl < 64
                _cond_vex pxor, \v1, \v0, \v0
        .else
                vpxorq \v1, \v0, \v0
        .endif
.endif
.endm
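
// (XORing the initial CRC into the appropriate end of the first 128 bits of
// data works because everything here is linear over GF(2): the initial CRC
// just contributes extra terms to the dividend, and those terms get folded
// down along with the data itself.)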

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
        _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
                  \src1, \src2, \dst
.endm
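
// Worked example of the immediate above: with LSB_CRC == 1, multiplying the
// HI64_TERMS of \src1 by the LO64_TERMS of \src2 gives
// $((1 ^ 1) << 4) ^ (0 ^ 1) = $0x01, i.e. the physically low qword of \src1
// and the physically high qword of \src2 -- which is exactly where those terms
// live in lsb-first order, per the definitions of LO64_TERMS and HI64_TERMS.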

// Fold \acc into \data and store the result back into \acc. \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
.macro _fold_vec acc, data, consts, tmp
        _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
        _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
        _cond_vex pxor, \data, \tmp, \tmp
        _cond_vex pxor, \tmp, \acc, \acc
.else
        vpternlogq $0x96, \data, \tmp, \acc
.endif
.endm
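
// What this computes, per 128-bit lane:
//	\acc := \data + HI64_TERMS(\acc) * HI64_TERMS(\consts)
//	              + LO64_TERMS(\acc) * LO64_TERMS(\consts)
// with all arithmetic in GF(2), i.e. additions are XORs. With suitable fold
// constants this is congruent, mod the CRC's generator polynomial, to
// \data + \acc * x^(fold distance in bits). Note that vpternlogq with
// immediate 0x96 is a three-way XOR; it merges the two pxor instructions of
// the non-AVX512 path into one.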

// Fold \acc into \data and store the result back into \acc. \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers. All arguments must have length \vl.
.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
        _load_data \vl, \data, \bswap_mask, \tmp1
        _fold_vec \acc, \tmp1, \consts, \tmp2
.else
        _fold_vec \acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
.macro _load_vec_folding_consts i
        _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
                    CONSTS
.endm
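
// Index check: the fold distance for 2**i vectors of VL bytes is 2**i * VL * 8
// = 2**(\i + LOG2_VL + 3) bits. E.g. with VL == 64 (LOG2_VL == 6) and \i == 2,
// the offset evaluates to 0 + (4 - 6 - 2)*16 = -4*16 =
// OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, matching the 4*512 = 2048-bit fold
// distance; with VL == 16 and \i == 0 it evaluates to 0, i.e. the
// fold_across_128_bits constants.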

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0. If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
        _fold_vec \v0, \v1, \consts, \tmp1
        test $\vl, LEN8
        jz .Lfold_vec_final_done\@
        _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
        add $\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
//	crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum. |len| is the data length in bytes, which must
// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
.macro _crc_pclmul n, lsb_crc, vl, avx_level
        .set LSB_CRC, \lsb_crc
        .set VL, \vl
        .set AVX_LEVEL, \avx_level

        // Define aliases for the xmm, ymm, or zmm registers according to VL.
        .irp i, 0,1,2,3,4,5,6,7
                .if VL == 16
                        .set V\i, %xmm\i
                        .set LOG2_VL, 4
                .elseif VL == 32
                        .set V\i, %ymm\i
                        .set LOG2_VL, 5
                .elseif VL == 64
                        .set V\i, %zmm\i
                        .set LOG2_VL, 6
                .else
                        .error "Unsupported vector length"
                .endif
        .endr
        // Define aliases for the function parameters.
        // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
        // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
        // when crc_t is shorter than u64.
#ifdef __x86_64__
        .if \n <= 32
                .set CRC, %edi
        .else
                .set CRC, %rdi
        .endif
        .set BUF, %rsi
        .set LEN, %rdx
        .set LEN32, %edx
        .set LEN8, %dl
        .set CONSTS_PTR, %rcx
#else
        // 32-bit support, assuming -mregparm=3 and not including support for
        // CRC-64 (which would use both eax and edx to pass the crc parameter).
        .set CRC, %eax
        .set BUF, %edx
        .set LEN, %ecx
        .set LEN32, %ecx
        .set LEN8, %cl
        .set CONSTS_PTR, %ebx // Passed on stack
#endif

        // Define aliases for some local variables. V0-V5 are used without
        // aliases (for accumulators, data, temporary values, etc). Staying
        // within the first 8 vector registers keeps the code 32-bit SSE
        // compatible and reduces the size of 64-bit SSE code slightly.
        .set BSWAP_MASK, V6
        .set BSWAP_MASK_YMM, %ymm6
        .set BSWAP_MASK_XMM, %xmm6
        .set CONSTS, V7
        .set CONSTS_YMM, %ymm7
        .set CONSTS_XMM, %xmm7

        // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
        // functions generated by this macro are called only via static_call.
        ANNOTATE_NOENDBR

#ifdef __i386__
        push CONSTS_PTR
        mov 8(%esp), CONSTS_PTR
#endif

        // Create a 128-bit vector that contains the initial CRC at the end
        // that represents the high-order polynomial coefficients, with the
        // rest of the vector zeroed. If the CRC is msb-first, also load the
        // byte-reflection table.
        .if \n <= 32
                _cond_vex movd, CRC, %xmm0
        .else
                _cond_vex movq, CRC, %xmm0
        .endif
        .if !LSB_CRC
                _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
                _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
        .endif

        // Load the first vector of data and XOR the initial CRC into the
        // appropriate end of the first 128-bit lane of data. If LEN < VL, then
        // use a short vector and jump ahead to the final reduction. (LEN >= 16
        // is guaranteed here but not necessarily LEN >= VL.)
        .if VL >= 32
                cmp $VL, LEN
                jae .Lat_least_1vec\@
                .if VL == 64
                        cmp $32, LEN32
                        jb .Lless_than_32bytes\@
                        _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
                        add $32, BUF
                        jmp .Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
                .endif
                _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
                add $16, BUF
                vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
                jmp .Lcheck_for_partial_block\@
.Lat_least_1vec\@:
        .endif
        _prepare_v0 VL, V0, V1, BSWAP_MASK

        // Handle VL <= LEN < 4*VL.
        cmp $4*VL-1, LEN
        ja .Lat_least_4vecs\@
        add $VL, BUF
        // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
        // If VL==16 then load fold_across_128_bits_consts first, as the final
        // reduction depends on it and it won't be loaded anywhere else.
        cmp $2*VL-1, LEN32
        .if VL == 16
                _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
        .endif
        jbe .Lreduce_1vec_to_128bits\@
        // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
        // the reduction from 2 vectors.
        _load_data VL, (BUF), BSWAP_MASK, V1
        add $VL, BUF
        jmp .Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
        // Load 3 more vectors of data.
        _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
        _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
        _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
        sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
        add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32

        // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
        // 4 vectors of data and write the result back to V0-V3.
        cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
        jbe .Lreduce_4vecs_to_2\@
        _load_vec_folding_consts 2
.Lfold_4vecs_loop\@:
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-4*VL, BUF
        add $-4*VL, LEN
        cmp $4*VL-1, LEN
        ja .Lfold_4vecs_loop\@

        // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
        // two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
        _load_vec_folding_consts 1
        _fold_vec V0, V2, CONSTS, V4
        _fold_vec V1, V3, CONSTS, V4
        test $2*VL, LEN8
        jz .Lreduce_2vecs_to_1\@
        _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
        sub $-2*VL, BUF

        // Fold V0 into V1 and write the result back to V0. Then fold one more
        // vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
        _load_vec_folding_consts 0
        _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
        .if VL == 64
                // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more
                // bits of data from BUF, if at least that much remains.
                vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
                vextracti64x4 $1, %zmm0, %ymm1
                _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
        .endif
        .if VL >= 32
                // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more
                // bits of data from BUF, if at least that much remains.
                vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
                vextracti128 $1, %ymm0, %xmm1
                _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
        .endif
        and $15, LEN32
        jz .Lreduce_128bits_to_crc\@

        // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
        // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
        // and B is the polynomial of the remaining LEN data bytes. To reduce
        // this to 128 bits without needing fold constants for each possible
        // LEN, rearrange this expression into C1*(x^128) + C2, where
        // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
        // Then fold C1 into C2, which is just another fold across 128 bits.
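        // (For example, if LEN == 5, this gives C1 = floor(A / x^88) and
        // C2 = (A*x^40 + B) mod x^128, since 8*LEN = 40 and 128 - 8*LEN = 88.)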

        .if !LSB_CRC || AVX_LEVEL == 0
                // Load the last 16 data bytes. Note that originally LEN was >= 16.
                _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
        .endif // Else will use vpblendvb mem operand later.
        .if !LSB_CRC
                neg LEN // Needed for indexing shuf_table
        .endif

        // tmp = A*x^(8*LEN) mod x^128
        // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
        //      i.e. right-shift by LEN bytes.
        // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
        //      i.e. left-shift by LEN bytes.
        _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
        _cond_vex pshufb, %xmm3, %xmm0, %xmm1

        // C1 = floor(A / x^(128 - 8*LEN))
        // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
        //      i.e. left-shift by 16-LEN bytes.
        // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
        //      i.e. right-shift by 16-LEN bytes.
        _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
                  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

        // C2 = tmp + B. This is just a blend of tmp with the last 16 data
        // bytes (reflected if msb-first). The blend mask is the shuffle table
        // that was used to create tmp: 0 selects tmp, and 1 selects the last
        // 16 data bytes.
        .if AVX_LEVEL == 0
                movdqa %xmm0, %xmm4
                movdqa %xmm3, %xmm0
                pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
                movdqa %xmm4, %xmm0
        .elseif LSB_CRC
                vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
        .else
                vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
        .endif

        // Fold C1 into C2 and store the 128-bit result in %xmm0.
        _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
        // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
        // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
        // order according to LSB_CRC), and G is the CRC's generator polynomial.

        // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
        //
        //	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
        //	      x^n * (%xmm0 mod x^64)
        //
        // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
        //
        //	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
        //	         x^64 * (%xmm0 mod x^64)
        //
        // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
        // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
        // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
        // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
        // (considering the extra factor of x that gets implicitly introduced by
        // each pclmulqdq when using lsb-first order), is identical to the
        // constant that was used earlier for folding the LO64_TERMS across 128
        // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
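        // (As a concrete example: for a msb-first CRC-32, i.e. n == 32, the
        // constant in LO64_TERMS of CONSTS_XMM would be (x^96 mod G) * x^32,
        // and t0 has at most 64 + 32 = 96 bits.)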
        _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
        .if LSB_CRC
                _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
        .else
                _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
        .endif
        _cond_vex pxor, %xmm1, %xmm0, %xmm0
        // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
        // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

        // First step of Barrett reduction: Compute floor(t0 / G). This is the
        // polynomial by which G needs to be multiplied to cancel out the x^n
        // and higher terms of t0, i.e. to reduce t0 mod G. First do:
        //
        //	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
        //
        // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
        // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
        // value that makes enough precision be carried through the calculation.
        //
        // The '* x' makes it so the result is floor(t1 / x^64) rather than
        // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
        // can be extracted much more easily in the next step. In the lsb-first
        // case the '* x' happens implicitly. In the msb-first case it must be
        // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
        // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
        // the multiplication by the x^64 term is handled using a pxor. The
        // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
        _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
        _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
        .if !LSB_CRC
                _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
        .endif
        // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

        // Second step of Barrett reduction: Cancel out the x^n and higher terms
        // of t0 by subtracting the needed multiple of G. This gives the CRC:
        //
        //	crc := t0 - (G * floor(t0 / G))
        //
        // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
        //
        //	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
        //
        // Furthermore, since the resulting CRC is n-bit, if mod x^n is
        // explicitly applied to it then the x^n term of G makes no difference
        // in the result and can be omitted. This helps keep the constant
        // multiplier in 64 bits in most cases. This gives the following:
        //
        //	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
        //	crc := (%xmm0 / x^(64-n)) mod x^n
        //
        // In the lsb-first case, each pclmulqdq implicitly introduces
        // an extra factor of x, so in that case the constant that needs to be
        // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
        // For lsb-first CRCs where n=64, the extra factor of x cannot be as
        // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
        // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
        // polynomials have nonzero x^n and x^0 terms.) It works out as: the
        // CRC has to be XORed with the physically low qword of %xmm1,
        // representing floor(t0 / G). The most efficient way to do that is to
        // move it to the physically high qword and use a ternlog to combine
        // the two XORs.
        .if LSB_CRC && \n == 64
                _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
                _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
                .if AVX_LEVEL <= 2
                        _cond_vex pxor, %xmm2, %xmm0, %xmm0
                        _cond_vex pxor, %xmm1, %xmm0, %xmm0
                .else
                        vpternlogq $0x96, %xmm2, %xmm1, %xmm0
                .endif
                _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
        .else
                _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
                _cond_vex pxor, %xmm1, %xmm0, %xmm0
                .if \n == 8
                        _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
                .elseif \n == 16
                        _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
                .elseif \n == 32
                        _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
                .else // \n == 64 && !LSB_CRC
                        _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
                .endif
        .endif

        .if VL > 16
                vzeroupper // Needed when ymm or zmm registers may have been used.
        .endif
#ifdef __i386__
        pop CONSTS_PTR
#endif
        RET
.endm

#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
SYM_FUNC_START(prefix##_pclmul_sse); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
SYM_FUNC_END(prefix##_pclmul_sse); \
\
SYM_FUNC_START(prefix##_vpclmul_avx2); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
SYM_FUNC_END(prefix##_vpclmul_avx2); \
\
SYM_FUNC_START(prefix##_vpclmul_avx512); \
        _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
SYM_FUNC_END(prefix##_vpclmul_avx512);
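
// For illustration only: an instantiation such as
//	DEFINE_CRC_PCLMUL_FUNCS(crc_foo, /* bits= */ 32, /* lsb= */ 1)
// (where "crc_foo" is a made-up prefix) would emit the three functions
// crc_foo_pclmul_sse, crc_foo_vpclmul_avx2, and crc_foo_vpclmul_avx512, all
// with the prototype documented above _crc_pclmul; the caller is expected to
// pick one of them based on CPU features, e.g. via static_call as noted above.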