Path: blob/master/lib/crc/powerpc/crc-vpmsum-template.S
26289 views
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME
 * Then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <[email protected]>, IBM
 */

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* Symbolic names for the GPRs that hold constant lvx offsets throughout */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

/* VPERM is only needed when the data endianness doesn't match REFLECT */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	/*
	 * Save the non-volatile GPRs we use below the stack pointer.
	 * NOTE(review): presumably relies on the ELFv1/ELFv2 ABI's protected
	 * zone below r1 since no stack frame is established — confirm.
	 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0		/* r0 = 0: first pass, v16-v23 not yet live */

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3		/* preserve the initial CRC for the len==0 case */

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56	/* r6 = len rounded down to a multiple of 128 */

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6

2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0		/* no-op; NOTE(review): looks like a dispatch-group spacer — confirm */

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3	/* const for the second half of this iteration */
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/* r0 = 1: the next MAX_SIZE block reuses v16-v23 as its seed */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits of */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	/* Restore the saved non-volatile VMX and GPR registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/* Exactly one 128-byte block: multiply it and go straight to the xor */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/* len < 256: process 16-byte chunks against .short_constants */
	cmpdi	r5,0
	beq	.Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 accumulate alternate chunks to break the dependency chain */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/* Fall through the label ladder, xoring only the chunks we produced */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* len == 0: return the initial CRC unchanged */
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)