/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME, then
 * include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = the CRC polynomial, using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <[email protected]>, IBM
 */
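/*
 * The folding relies on CRCs being linear over GF(2): crc(a ^ b) =
 * crc(a) ^ crc(b), and appending n zero bits to a chunk is a carryless
 * multiply by x^n mod p. Each 16 byte chunk can therefore be multiplied
 * (vpmsumd) by a precomputed power of x and all of the partial products
 * xored together at the end.
 */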

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
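/*
 * i.e. byte swap the 16 byte loads whenever the memory byte order does
 * not match the bit ordering of the CRC: big endian data with a
 * reflected CRC, or little endian data with a non-reflected one.
 */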

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16
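	/*
	 * r6 = r1 - 216 is the base of the 10 x 16 byte VMX save area,
	 * sitting below the 56 bytes of GPR saves above; r7 = r6 + 128
	 * points at the last two slots, used for v28 and v29.
	 */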

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
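	/*
	 * e.g. a full 32 kB pass runs r7 = 32768/128 = 256 iterations and
	 * consumes 256 * 16 = 4096 bytes of constants, so r8 works out to
	 * 4096 - 4096 = 0 and we start at the top of the table. A shorter
	 * final pass starts proportionally further in.
	 */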
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
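	/*
	 * The ori r2,r2,0 instructions sprinkled through the passes below
	 * are no-ops; they appear to be purely scheduling aids, padding
	 * the instruction stream so the loads and vpmsums land in
	 * separate dispatch groups.
	 */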
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three
	 * iterations to complete - first iteration load, second iteration
	 * vpmsum, third iteration xor.
	 */
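	/*
	 * Concretely, each iteration below first xors the previous
	 * iteration's vpmsum results (v8-v15) into the accumulators
	 * (v0-v7), then issues new vpmsums on the data loaded last
	 * iteration, then refills v16-v23 with fresh data.
	 */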
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b
.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1	/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
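	/*
	 * A rough C model of the four instructions below, assuming (per
	 * the "ma" and "qn" comments) const1 = m = floor(x^64 / p) and
	 * const2 = n = p. The helper names are illustrative, not the
	 * kernel's:
	 *
	 *	// Carryless multiply, truncated to 64 bits; the callers
	 *	// below only pass operands whose full product fits.
	 *	static u64 clmul(u64 a, u64 b)
	 *	{
	 *		u64 r = 0;
	 *
	 *		for (int i = 0; i < 64; i++)
	 *			if (b & (1ULL << i))
	 *				r ^= a << i;
	 *		return r;
	 *	}
	 *
	 *	// Reduce a 64 bit value a modulo the degree-32 poly n.
	 *	// The low half of a cannot affect q, so feeding in a >> 32
	 *	// matches the full 128 bit vpmsumd product used here.
	 *	static u32 barrett(u64 a, u64 mu, u64 poly)
	 *	{
	 *		u64 q = clmul(a >> 32, mu) >> 32; // floor(ma/x^64)
	 *
	 *		return (u32)(a ^ clmul(q, poly)); // a - qn in GF(2)
	 *	}
	 */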
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can
	 * reflect the algorithm because we don't carry in mod 2 arithmetic.
	 */
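	/*
	 * Relative to the sketch above, this reflected form takes q from
	 * the bottom 32 bits of the products (the vand with mask_32bit)
	 * instead of the top.
	 */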
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

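	/*
	 * Fall-through xor cascade: each bdz above enters the chain at
	 * the label for the last chunk processed, so only the live
	 * partial products get folded into the v19/v20 accumulators.
	 */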
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)