GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/poly1305-armv4.pl
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4	NEON
#
# ARM11xx(ARMv6)	7.78/+100%	-
# Cortex-A5		6.35/+130%	3.00
# Cortex-A8		6.25/+115%	2.36
# Cortex-A9		5.10/+95%	2.55
# Cortex-A15		3.85/+85%	1.25(**)
# Snapdragon S4		5.70/+100%	1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**)	these are trade-off results; they can be improved by ~8%, but at
#	the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
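
# The two positional arguments are the perlasm "flavour" and the output file;
# with a real flavour the generated code is piped through arm-xlate.pl, while
# "void" (or no flavour) writes the raw output straight to the file. A typical
# invocation might look like the following (the flavour and file names here
# are only an illustration, not taken from the kernel build files):
#
#   perl poly1305-armv4.pl linux32 poly1305-core.S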

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit poly1305_emit_arch
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.text

.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
	stmdb sp!,{r4-r11}

	eor r3,r3,r3
	cmp $inp,#0
	str r3,[$ctx,#0] @ zero hash value
	str r3,[$ctx,#4]
	str r3,[$ctx,#8]
	str r3,[$ctx,#12]
	str r3,[$ctx,#16]
	str r3,[$ctx,#36] @ clear is_base2_26
	add $ctx,$ctx,#20

#ifdef __thumb2__
	it eq
#endif
	moveq r0,#0
	beq .Lno_key

#if __ARM_MAX_ARCH__>=7
	mov r3,#-1
	str r3,[$ctx,#28] @ impossible key power value
# ifndef __KERNEL__
	adr r11,.Lpoly1305_init
	ldr r12,.LOPENSSL_armcap
# endif
#endif
	ldrb r4,[$inp,#0]
	mov r10,#0x0fffffff
	ldrb r5,[$inp,#1]
	and r3,r10,#-4 @ 0x0ffffffc
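	@ The two masks implement the standard Poly1305 clamping of r: the
	@ least significant 32-bit word is masked with 0x0fffffff and the
	@ three higher words with 0x0ffffffc, i.e.
	@ r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.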
	ldrb r6,[$inp,#2]
	ldrb r7,[$inp,#3]
	orr r4,r4,r5,lsl#8
	ldrb r5,[$inp,#4]
	orr r4,r4,r6,lsl#16
	ldrb r6,[$inp,#5]
	orr r4,r4,r7,lsl#24
	ldrb r7,[$inp,#6]
	and r4,r4,r10

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
	ldr r12,[r11,r12] @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr r12,[r12]
# endif
#endif
	ldrb r8,[$inp,#7]
	orr r5,r5,r6,lsl#8
	ldrb r6,[$inp,#8]
	orr r5,r5,r7,lsl#16
	ldrb r7,[$inp,#9]
	orr r5,r5,r8,lsl#24
	ldrb r8,[$inp,#10]
	and r5,r5,r3

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst r12,#ARMV7_NEON @ check for NEON
# ifdef __thumb2__
	adr r9,.Lpoly1305_blocks_neon
	adr r11,.Lpoly1305_blocks
	it ne
	movne r11,r9
	adr r12,.Lpoly1305_emit
	orr r11,r11,#1 @ thumb-ify addresses
	orr r12,r12,#1
# else
	add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	ite eq
	addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb r9,[$inp,#11]
	orr r6,r6,r7,lsl#8
	ldrb r7,[$inp,#12]
	orr r6,r6,r8,lsl#16
	ldrb r8,[$inp,#13]
	orr r6,r6,r9,lsl#24
	ldrb r9,[$inp,#14]
	and r6,r6,r3

	ldrb r10,[$inp,#15]
	orr r7,r7,r8,lsl#8
	str r4,[$ctx,#0]
	orr r7,r7,r9,lsl#16
	str r5,[$ctx,#4]
	orr r7,r7,r10,lsl#24
	str r6,[$ctx,#8]
	and r7,r7,r3
	str r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia r2,{r11,r12} @ fill functions table
	mov r0,#1
#else
	mov r0,#0
#endif
.Lno_key:
	ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret @ bx lr
#else
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb sp!,{r3-r11,lr}

	ands $len,$len,#-16
	beq .Lno_data

	add $len,$len,$inp @ end pointer
	sub sp,sp,#32

#if __ARM_ARCH__<7
	ldmia $ctx,{$h0-$r3} @ load context
	add $ctx,$ctx,#20
	str $len,[sp,#16] @ offload stuff
	str $ctx,[sp,#12]
#else
	ldr lr,[$ctx,#36] @ is_base2_26
	ldmia $ctx!,{$h0-$h4} @ load hash value
	str $len,[sp,#16] @ offload stuff
	str $ctx,[sp,#12]

	adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
	mov $r1,$h1,lsr#6
	adcs $r1,$r1,$h2,lsl#20
	mov $r2,$h2,lsr#12
	adcs $r2,$r2,$h3,lsl#14
	mov $r3,$h3,lsr#18
	adcs $r3,$r3,$h4,lsl#8
	mov $len,#0
	teq lr,#0
	str $len,[$ctx,#16] @ clear is_base2_26
	adc $len,$len,$h4,lsr#24
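	@ With h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 (limbs may
	@ slightly exceed 26 bits), the repacking above computes the four
	@ 32-bit words h0+(h1<<26), (h1>>6)+(h2<<20), (h2>>12)+(h3<<14),
	@ (h3>>18)+(h4<<8), with the leftover carry h4>>24 kept separately;
	@ adds/adcs propagate the inter-word carries.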

	itttt ne
	movne $h0,$r0 @ choose between radixes
	movne $h1,$r1
	movne $h2,$r2
	movne $h3,$r3
	ldmia $ctx,{$r0-$r3} @ load key
	it ne
	movne $h4,$len
#endif

	mov lr,$inp
	cmp $padbit,#0
	str $r1,[sp,#20]
	str $r2,[sp,#24]
	str $r3,[sp,#28]
	b .Loop

.align 4
.Loop:
#if __ARM_ARCH__<7
	ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
	it hi
# endif
	addhi $h4,$h4,#1 @ 1<<128
	ldrb r1,[lr,#-15]
	ldrb r2,[lr,#-14]
	ldrb r3,[lr,#-13]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-12]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-11]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-10]
	adds $h0,$h0,r3 @ accumulate input

	ldrb r3,[lr,#-9]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-8]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-7]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-6]
	adcs $h1,$h1,r3

	ldrb r3,[lr,#-5]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-4]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-3]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-2]
	adcs $h2,$h2,r3

	ldrb r3,[lr,#-1]
	orr r1,r0,r1,lsl#8
	str lr,[sp,#8] @ offload input pointer
	orr r2,r1,r2,lsl#16
	add $s1,$r1,$r1,lsr#2
	orr r3,r2,r3,lsl#24
#else
	ldr r0,[lr],#16 @ load input
	it hi
	addhi $h4,$h4,#1 @ padbit
	ldr r1,[lr,#-12]
	ldr r2,[lr,#-8]
	ldr r3,[lr,#-4]
# ifdef __ARMEB__
	rev r0,r0
	rev r1,r1
	rev r2,r2
	rev r3,r3
# endif
	adds $h0,$h0,r0 @ accumulate input
	str lr,[sp,#8] @ offload input pointer
	adcs $h1,$h1,r1
	add $s1,$r1,$r1,lsr#2
	adcs $h2,$h2,r2
#endif
	add $s2,$r2,$r2,lsr#2
	adcs $h3,$h3,r3
	add $s3,$r3,$r3,lsr#2

	umull r2,r3,$h1,$r0
	adc $h4,$h4,#0
	umull r0,r1,$h0,$r0
	umlal r2,r3,$h4,$s1
	umlal r0,r1,$h3,$s1
	ldr $r1,[sp,#20] @ reload $r1
	umlal r2,r3,$h2,$s3
	umlal r0,r1,$h1,$s3
	umlal r2,r3,$h3,$s2
	umlal r0,r1,$h2,$s2
	umlal r2,r3,$h0,$r1
	str r0,[sp,#0] @ future $h0
	mul r0,$s2,$h4
	ldr $r2,[sp,#24] @ reload $r2
	adds r2,r2,r1 @ d1+=d0>>32
	eor r1,r1,r1
	adc lr,r3,#0 @ future $h2
	str r2,[sp,#4] @ future $h1

	mul r2,$s3,$h4
	eor r3,r3,r3
	umlal r0,r1,$h3,$s3
	ldr $r3,[sp,#28] @ reload $r3
	umlal r2,r3,$h3,$r0
	umlal r0,r1,$h2,$r0
	umlal r2,r3,$h2,$r1
	umlal r0,r1,$h1,$r1
	umlal r2,r3,$h1,$r2
	umlal r0,r1,$h0,$r2
	umlal r2,r3,$h0,$r3
	ldr $h0,[sp,#0]
	mul $h4,$r0,$h4
	ldr $h1,[sp,#4]

	adds $h2,lr,r0 @ d2+=d1>>32
	ldr lr,[sp,#8] @ reload input pointer
	adc r1,r1,#0
	adds $h3,r2,r1 @ d3+=d2>>32
	ldr r0,[sp,#16] @ reload end pointer
	adc r3,r3,#0
	add $h4,$h4,r3 @ h4+=d3>>32

	and r1,$h4,#-4
	and $h4,$h4,#3
	add r1,r1,r1,lsr#2 @ *=5
	adds $h0,$h0,r1
	adcs $h1,$h1,#0
	adcs $h2,$h2,#0
	adcs $h3,$h3,#0
	adc $h4,$h4,#0

	cmp r0,lr @ done yet?
	bhi .Loop

	ldr $ctx,[sp,#12]
	add sp,sp,#32
	stmdb $ctx,{$h0-$h4} @ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia sp!,{r3-r11,pc}
#else
	ldmia sp!,{r3-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$ctx;

$code.=<<___;
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
	stmdb sp!,{r4-r11}

	ldmia $ctx,{$h0-$h4}

#if __ARM_ARCH__>=7
	ldr ip,[$ctx,#36] @ is_base2_26

	adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
	mov $g1,$h1,lsr#6
	adcs $g1,$g1,$h2,lsl#20
	mov $g2,$h2,lsr#12
	adcs $g2,$g2,$h3,lsl#14
	mov $g3,$h3,lsr#18
	adcs $g3,$g3,$h4,lsl#8
	mov $g4,#0
	adc $g4,$g4,$h4,lsr#24

	tst ip,ip
	itttt ne
	movne $h0,$g0
	movne $h1,$g1
	movne $h2,$g2
	movne $h3,$g3
	it ne
	movne $h4,$g4
#endif

	adds $g0,$h0,#5 @ compare to modulus
	adcs $g1,$h1,#0
	adcs $g2,$h2,#0
	adcs $g3,$h3,#0
	adc $g4,$h4,#0
	tst $g4,#4 @ did it carry/borrow?

#ifdef __thumb2__
	it ne
#endif
	movne $h0,$g0
	ldr $g0,[$nonce,#0]
#ifdef __thumb2__
	it ne
#endif
	movne $h1,$g1
	ldr $g1,[$nonce,#4]
#ifdef __thumb2__
	it ne
#endif
	movne $h2,$g2
	ldr $g2,[$nonce,#8]
#ifdef __thumb2__
	it ne
#endif
	movne $h3,$g3
	ldr $g3,[$nonce,#12]

	adds $h0,$h0,$g0
	adcs $h1,$h1,$g1
	adcs $h2,$h2,$g2
	adc $h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev $h0,$h0
	rev $h1,$h1
	rev $h2,$h2
	rev $h3,$h3
# endif
	str $h0,[$mac,#0]
	str $h1,[$mac,#4]
	str $h2,[$mac,#8]
	str $h3,[$mac,#12]
#else
	strb $h0,[$mac,#0]
	mov $h0,$h0,lsr#8
	strb $h1,[$mac,#4]
	mov $h1,$h1,lsr#8
	strb $h2,[$mac,#8]
	mov $h2,$h2,lsr#8
	strb $h3,[$mac,#12]
	mov $h3,$h3,lsr#8

	strb $h0,[$mac,#1]
	mov $h0,$h0,lsr#8
	strb $h1,[$mac,#5]
	mov $h1,$h1,lsr#8
	strb $h2,[$mac,#9]
	mov $h2,$h2,lsr#8
	strb $h3,[$mac,#13]
	mov $h3,$h3,lsr#8

	strb $h0,[$mac,#2]
	mov $h0,$h0,lsr#8
	strb $h1,[$mac,#6]
	mov $h1,$h1,lsr#8
	strb $h2,[$mac,#10]
	mov $h2,$h2,lsr#8
	strb $h3,[$mac,#14]
	mov $h3,$h3,lsr#8

	strb $h0,[$mac,#3]
	strb $h1,[$mac,#7]
	strb $h2,[$mac,#11]
	strb $h3,[$mac,#15]
#endif
	ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret @ bx lr
#else
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu neon

.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
.Lpoly1305_init_neon:
	ldr r3,[$ctx,#48] @ first table element
	cmp r3,#-1 @ is value impossible?
	bne .Lno_init_neon

	ldr r4,[$ctx,#20] @ load key base 2^32
	ldr r5,[$ctx,#24]
	ldr r6,[$ctx,#28]
	ldr r7,[$ctx,#32]

	and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	and r3,r3,#0x03ffffff
	and r4,r4,#0x03ffffff
	and r5,r5,#0x03ffffff

	vdup.32 $R0,r2 @ r^1 in both lanes
	add r2,r3,r3,lsl#2 @ *5
	vdup.32 $R1,r3
	add r3,r4,r4,lsl#2
	vdup.32 $S1,r2
	vdup.32 $R2,r4
	add r4,r5,r5,lsl#2
	vdup.32 $S2,r3
	vdup.32 $R3,r5
	add r5,r6,r6,lsl#2
	vdup.32 $S3,r4
	vdup.32 $R4,r6
	vdup.32 $S4,r5

	mov $zeros,#2 @ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
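	@
	@ The 5*r terms come from reduction modulo 2^130-5: a partial product
	@ whose weight reaches 2^130 wraps around multiplied by 5, because
	@ 2^130 = 5 (mod 2^130-5). E.g. h4*r1 sits at weight 2^130 and is
	@ therefore folded into d0 as h4*5*r1.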

	vmull.u32 $D0,$R0,${R0}[1]
	vmull.u32 $D1,$R1,${R0}[1]
	vmull.u32 $D2,$R2,${R0}[1]
	vmull.u32 $D3,$R3,${R0}[1]
	vmull.u32 $D4,$R4,${R0}[1]

	vmlal.u32 $D0,$R4,${S1}[1]
	vmlal.u32 $D1,$R0,${R1}[1]
	vmlal.u32 $D2,$R1,${R1}[1]
	vmlal.u32 $D3,$R2,${R1}[1]
	vmlal.u32 $D4,$R3,${R1}[1]

	vmlal.u32 $D0,$R3,${S2}[1]
	vmlal.u32 $D1,$R4,${S2}[1]
	vmlal.u32 $D3,$R1,${R2}[1]
	vmlal.u32 $D2,$R0,${R2}[1]
	vmlal.u32 $D4,$R2,${R2}[1]

	vmlal.u32 $D0,$R2,${S3}[1]
	vmlal.u32 $D3,$R0,${R3}[1]
	vmlal.u32 $D1,$R3,${S3}[1]
	vmlal.u32 $D2,$R4,${S3}[1]
	vmlal.u32 $D4,$R1,${R3}[1]

	vmlal.u32 $D3,$R4,${S4}[1]
	vmlal.u32 $D0,$R1,${S4}[1]
	vmlal.u32 $D1,$R2,${S4}[1]
	vmlal.u32 $D2,$R3,${S4}[1]
	vmlal.u32 $D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three is
	@ n+2, and so is the sum of four. The sum of 2^m n-m-bit numbers and
	@ an n-bit one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In the key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The vmlal.u32
	@ instruction accepts 2x32-bit input and writes a 2x64-bit result.
	@ This means that the result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize the number of instructions [as well as the number of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that the result of the right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows using paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.
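	@
	@ The sequence below therefore performs two interleaved carry chains:
	@ h3 -> h4 and h0 -> h1 first, then h4 -> h0 (the carry out of h4 is
	@ multiplied by 5 before being folded back in) and h1 -> h2, then
	@ h2 -> h3, and finally one more h0 -> h1 and h3 -> h4 step, leaving
	@ every limb at 26 bits except h1 and h4, which may reach 27 as
	@ explained above.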

	vshr.u64 $T0,$D3,#26
	vmovn.i64 $D3#lo,$D3
	vshr.u64 $T1,$D0,#26
	vmovn.i64 $D0#lo,$D0
	vadd.i64 $D4,$D4,$T0 @ h3 -> h4
	vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
	vadd.i64 $D1,$D1,$T1 @ h0 -> h1
	vbic.i32 $D0#lo,#0xfc000000

	vshrn.u64 $T0#lo,$D4,#26
	vmovn.i64 $D4#lo,$D4
	vshr.u64 $T1,$D1,#26
	vmovn.i64 $D1#lo,$D1
	vadd.i64 $D2,$D2,$T1 @ h1 -> h2
	vbic.i32 $D4#lo,#0xfc000000
	vbic.i32 $D1#lo,#0xfc000000

	vadd.i32 $D0#lo,$D0#lo,$T0#lo
	vshl.u32 $T0#lo,$T0#lo,#2
	vshrn.u64 $T1#lo,$D2,#26
	vmovn.i64 $D2#lo,$D2
	vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
	vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
	vbic.i32 $D2#lo,#0xfc000000

	vshr.u32 $T0#lo,$D0#lo,#26
	vbic.i32 $D0#lo,#0xfc000000
	vshr.u32 $T1#lo,$D3#lo,#26
	vbic.i32 $D3#lo,#0xfc000000
	vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
	vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4

	subs $zeros,$zeros,#1
	beq .Lsquare_break_neon

	add $tbl0,$ctx,#(48+0*9*4)
	add $tbl1,$ctx,#(48+1*9*4)

	vtrn.32 $R0,$D0#lo @ r^2:r^1
	vtrn.32 $R2,$D2#lo
	vtrn.32 $R3,$D3#lo
	vtrn.32 $R1,$D1#lo
	vtrn.32 $R4,$D4#lo

	vshl.u32 $S2,$R2,#2 @ *5
	vshl.u32 $S3,$R3,#2
	vshl.u32 $S1,$R1,#2
	vshl.u32 $S4,$R4,#2
	vadd.i32 $S2,$S2,$R2
	vadd.i32 $S1,$S1,$R1
	vadd.i32 $S3,$S3,$R3
	vadd.i32 $S4,$S4,$R4

	vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32 {${S4}[0]},[$tbl0,:32]
	vst1.32 {${S4}[1]},[$tbl1,:32]

	b .Lsquare_neon

.align 4
.Lsquare_break_neon:
	add $tbl0,$ctx,#(48+2*4*9)
	add $tbl1,$ctx,#(48+3*4*9)

	vmov $R0,$D0#lo @ r^4:r^3
	vshl.u32 $S1,$D1#lo,#2 @ *5
	vmov $R1,$D1#lo
	vshl.u32 $S2,$D2#lo,#2
	vmov $R2,$D2#lo
	vshl.u32 $S3,$D3#lo,#2
	vmov $R3,$D3#lo
	vshl.u32 $S4,$D4#lo,#2
	vmov $R4,$D4#lo
	vadd.i32 $S1,$S1,$D1#lo
	vadd.i32 $S2,$S2,$D2#lo
	vadd.i32 $S3,$S3,$D3#lo
	vadd.i32 $S4,$S4,$D4#lo

	vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32 {${S4}[0]},[$tbl0]
	vst1.32 {${S4}[1]},[$tbl1]

.Lno_init_neon:
	ret @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon

.globl poly1305_blocks_neon
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr ip,[$ctx,#36] @ is_base2_26

	cmp $len,#64
	blo .Lpoly1305_blocks

	stmdb sp!,{r4-r7}
	vstmdb sp!,{d8-d15} @ ABI specification says so

	tst ip,ip @ is_base2_26?
	bne .Lbase2_26_neon

	stmdb sp!,{r1-r3,lr}
	bl .Lpoly1305_init_neon

	ldr r4,[$ctx,#0] @ load hash value base 2^32
	ldr r5,[$ctx,#4]
	ldr r6,[$ctx,#8]
	ldr r7,[$ctx,#12]
	ldr ip,[$ctx,#16]

	and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	veor $D0#lo,$D0#lo,$D0#lo
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	veor $D1#lo,$D1#lo,$D1#lo
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	veor $D2#lo,$D2#lo,$D2#lo
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	veor $D3#lo,$D3#lo,$D3#lo
	and r3,r3,#0x03ffffff
	orr r6,r6,ip,lsl#24
	veor $D4#lo,$D4#lo,$D4#lo
	and r4,r4,#0x03ffffff
	mov r1,#1
	and r5,r5,#0x03ffffff
	str r1,[$ctx,#36] @ set is_base2_26

	vmov.32 $D0#lo[0],r2
	vmov.32 $D1#lo[0],r3
	vmov.32 $D2#lo[0],r4
	vmov.32 $D3#lo[0],r5
	vmov.32 $D4#lo[0],r6
	adr $zeros,.Lzeros

	ldmia sp!,{r1-r3,lr}
	b .Lhash_loaded

.align 4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor $D0#lo,$D0#lo,$D0#lo
	veor $D1#lo,$D1#lo,$D1#lo
	veor $D2#lo,$D2#lo,$D2#lo
	veor $D3#lo,$D3#lo,$D3#lo
	veor $D4#lo,$D4#lo,$D4#lo
	vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr $zeros,.Lzeros
	vld1.32 {$D4#lo[0]},[$ctx]
	sub $ctx,$ctx,#16 @ rewind

.Lhash_loaded:
	add $in2,$inp,#32
	mov $padbit,$padbit,lsl#24
	tst $len,#31
	beq .Leven

	vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32 $H4#lo[0],$padbit
	sub $len,$len,#16
	add $in2,$inp,#32

# ifdef __ARMEB__
	vrev32.8 $H0,$H0
	vrev32.8 $H3,$H3
	vrev32.8 $H1,$H1
	vrev32.8 $H2,$H2
# endif
	vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
	vshl.u32 $H3#lo,$H3#lo,#18

	vsri.u32 $H3#lo,$H2#lo,#14
	vshl.u32 $H2#lo,$H2#lo,#12
	vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi

	vbic.i32 $H3#lo,#0xfc000000
	vsri.u32 $H2#lo,$H1#lo,#20
	vshl.u32 $H1#lo,$H1#lo,#6

	vbic.i32 $H2#lo,#0xfc000000
	vsri.u32 $H1#lo,$H0#lo,#26
	vadd.i32 $H3#hi,$H3#lo,$D3#lo

	vbic.i32 $H0#lo,#0xfc000000
	vbic.i32 $H1#lo,#0xfc000000
	vadd.i32 $H2#hi,$H2#lo,$D2#lo

	vadd.i32 $H0#hi,$H0#lo,$D0#lo
	vadd.i32 $H1#hi,$H1#lo,$D1#lo

	mov $tbl1,$zeros
	add $tbl0,$ctx,#48

	cmp $len,$len
	b .Long_tail

.align 4
.Leven:
	subs $len,$len,#64
	it lo
	movlo $in2,$zeros

	vmov.i32 $H4,#1<<24 @ padbit, yes, always
	vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
	add $inp,$inp,#64
	vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
	add $in2,$in2,#64
	itt hi
	addhi $tbl1,$ctx,#(48+1*9*4)
	addhi $tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8 $H0,$H0
	vrev32.8 $H3,$H3
	vrev32.8 $H1,$H1
	vrev32.8 $H2,$H2
# endif
	vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
	vshl.u32 $H3,$H3,#18

	vsri.u32 $H3,$H2,#14
	vshl.u32 $H2,$H2,#12

	vbic.i32 $H3,#0xfc000000
	vsri.u32 $H2,$H1,#20
	vshl.u32 $H1,$H1,#6

	vbic.i32 $H2,#0xfc000000
	vsri.u32 $H1,$H0,#26

	vbic.i32 $H0,#0xfc000000
	vbic.i32 $H1,#0xfc000000

	bls .Lskip_loop

	vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
	vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
	vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b .Loop_neon

.align 5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@ \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@ \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
	vmull.u32 $D2,$H2#hi,${R0}[1]
	vadd.i32 $H0#lo,$H0#lo,$D0#lo
	vmull.u32 $D0,$H0#hi,${R0}[1]
	vadd.i32 $H3#lo,$H3#lo,$D3#lo
	vmull.u32 $D3,$H3#hi,${R0}[1]
	vmlal.u32 $D2,$H1#hi,${R1}[1]
	vadd.i32 $H1#lo,$H1#lo,$D1#lo
	vmull.u32 $D1,$H1#hi,${R0}[1]

	vadd.i32 $H4#lo,$H4#lo,$D4#lo
	vmull.u32 $D4,$H4#hi,${R0}[1]
	subs $len,$len,#64
	vmlal.u32 $D0,$H4#hi,${S1}[1]
	it lo
	movlo $in2,$zeros
	vmlal.u32 $D3,$H2#hi,${R1}[1]
	vld1.32 ${S4}[1],[$tbl1,:32]
	vmlal.u32 $D1,$H0#hi,${R1}[1]
	vmlal.u32 $D4,$H3#hi,${R1}[1]

	vmlal.u32 $D0,$H3#hi,${S2}[1]
	vmlal.u32 $D3,$H1#hi,${R2}[1]
	vmlal.u32 $D4,$H2#hi,${R2}[1]
	vmlal.u32 $D1,$H4#hi,${S2}[1]
	vmlal.u32 $D2,$H0#hi,${R2}[1]

	vmlal.u32 $D3,$H0#hi,${R3}[1]
	vmlal.u32 $D0,$H2#hi,${S3}[1]
	vmlal.u32 $D4,$H1#hi,${R3}[1]
	vmlal.u32 $D1,$H3#hi,${S3}[1]
	vmlal.u32 $D2,$H4#hi,${S3}[1]

	vmlal.u32 $D3,$H4#hi,${S4}[1]
	vmlal.u32 $D0,$H1#hi,${S4}[1]
	vmlal.u32 $D4,$H0#hi,${R4}[1]
	vmlal.u32 $D1,$H2#hi,${S4}[1]
	vmlal.u32 $D2,$H3#hi,${S4}[1]

	vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
	add $in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32 $D3,$H3#lo,${R0}[0]
	vmlal.u32 $D0,$H0#lo,${R0}[0]
	vmlal.u32 $D4,$H4#lo,${R0}[0]
	vmlal.u32 $D1,$H1#lo,${R0}[0]
	vmlal.u32 $D2,$H2#lo,${R0}[0]
	vld1.32 ${S4}[0],[$tbl0,:32]

	vmlal.u32 $D3,$H2#lo,${R1}[0]
	vmlal.u32 $D0,$H4#lo,${S1}[0]
	vmlal.u32 $D4,$H3#lo,${R1}[0]
	vmlal.u32 $D1,$H0#lo,${R1}[0]
	vmlal.u32 $D2,$H1#lo,${R1}[0]

	vmlal.u32 $D3,$H1#lo,${R2}[0]
	vmlal.u32 $D0,$H3#lo,${S2}[0]
	vmlal.u32 $D4,$H2#lo,${R2}[0]
	vmlal.u32 $D1,$H4#lo,${S2}[0]
	vmlal.u32 $D2,$H0#lo,${R2}[0]

	vmlal.u32 $D3,$H0#lo,${R3}[0]
	vmlal.u32 $D0,$H2#lo,${S3}[0]
	vmlal.u32 $D4,$H1#lo,${R3}[0]
	vmlal.u32 $D1,$H3#lo,${S3}[0]
	vmlal.u32 $D3,$H4#lo,${S4}[0]

	vmlal.u32 $D2,$H4#lo,${S3}[0]
	vmlal.u32 $D0,$H1#lo,${S4}[0]
	vmlal.u32 $D4,$H0#lo,${R4}[0]
	vmov.i32 $H4,#1<<24 @ padbit, yes, always
	vmlal.u32 $D1,$H2#lo,${S4}[0]
	vmlal.u32 $D2,$H3#lo,${S4}[0]

	vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
	add $inp,$inp,#64
# ifdef __ARMEB__
	vrev32.8 $H0,$H0
	vrev32.8 $H1,$H1
	vrev32.8 $H2,$H2
	vrev32.8 $H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64 $T0,$D3,#26
	vmovn.i64 $D3#lo,$D3
	vshr.u64 $T1,$D0,#26
	vmovn.i64 $D0#lo,$D0
	vadd.i64 $D4,$D4,$T0 @ h3 -> h4
	vbic.i32 $D3#lo,#0xfc000000
	vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
	vadd.i64 $D1,$D1,$T1 @ h0 -> h1
	vshl.u32 $H3,$H3,#18
	vbic.i32 $D0#lo,#0xfc000000

	vshrn.u64 $T0#lo,$D4,#26
	vmovn.i64 $D4#lo,$D4
	vshr.u64 $T1,$D1,#26
	vmovn.i64 $D1#lo,$D1
	vadd.i64 $D2,$D2,$T1 @ h1 -> h2
	vsri.u32 $H3,$H2,#14
	vbic.i32 $D4#lo,#0xfc000000
	vshl.u32 $H2,$H2,#12
	vbic.i32 $D1#lo,#0xfc000000

	vadd.i32 $D0#lo,$D0#lo,$T0#lo
	vshl.u32 $T0#lo,$T0#lo,#2
	vbic.i32 $H3,#0xfc000000
	vshrn.u64 $T1#lo,$D2,#26
	vmovn.i64 $D2#lo,$D2
	vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
	vsri.u32 $H2,$H1,#20
	vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
	vshl.u32 $H1,$H1,#6
	vbic.i32 $D2#lo,#0xfc000000
	vbic.i32 $H2,#0xfc000000

	vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
	vmovn.i64 $D0#lo,$D0
	vsri.u32 $H1,$H0,#26
	vbic.i32 $H0,#0xfc000000
	vshr.u32 $T1#lo,$D3#lo,#26
	vbic.i32 $D3#lo,#0xfc000000
	vbic.i32 $D0#lo,#0xfc000000
	vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
	vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
	vbic.i32 $H1,#0xfc000000

	bhi .Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add $tbl1,$ctx,#(48+0*9*4)
	add $tbl0,$ctx,#(48+1*9*4)
	adds $len,$len,#32
	it ne
	movne $len,#0
	bne .Long_tail

	vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
	vadd.i32 $H0#hi,$H0#lo,$D0#lo
	vadd.i32 $H3#hi,$H3#lo,$D3#lo
	vadd.i32 $H1#hi,$H1#lo,$D1#lo
	vadd.i32 $H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
	vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2

	vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
	vmull.u32 $D2,$H2#hi,$R0
	vadd.i32 $H0#lo,$H0#lo,$D0#lo
	vmull.u32 $D0,$H0#hi,$R0
	vadd.i32 $H3#lo,$H3#lo,$D3#lo
	vmull.u32 $D3,$H3#hi,$R0
	vadd.i32 $H1#lo,$H1#lo,$D1#lo
	vmull.u32 $D1,$H1#hi,$R0
	vadd.i32 $H4#lo,$H4#lo,$D4#lo
	vmull.u32 $D4,$H4#hi,$R0

	vmlal.u32 $D0,$H4#hi,$S1
	vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32 $D3,$H2#hi,$R1
	vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32 $D1,$H0#hi,$R1
	vmlal.u32 $D4,$H3#hi,$R1
	vmlal.u32 $D2,$H1#hi,$R1

	vmlal.u32 $D3,$H1#hi,$R2
	vld1.32 ${S4}[1],[$tbl1,:32]
	vmlal.u32 $D0,$H3#hi,$S2
	vld1.32 ${S4}[0],[$tbl0,:32]
	vmlal.u32 $D4,$H2#hi,$R2
	vmlal.u32 $D1,$H4#hi,$S2
	vmlal.u32 $D2,$H0#hi,$R2

	vmlal.u32 $D3,$H0#hi,$R3
	it ne
	addne $tbl1,$ctx,#(48+2*9*4)
	vmlal.u32 $D0,$H2#hi,$S3
	it ne
	addne $tbl0,$ctx,#(48+3*9*4)
	vmlal.u32 $D4,$H1#hi,$R3
	vmlal.u32 $D1,$H3#hi,$S3
	vmlal.u32 $D2,$H4#hi,$S3

	vmlal.u32 $D3,$H4#hi,$S4
	vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
	vmlal.u32 $D0,$H1#hi,$S4
	vshr.u64 $MASK,$MASK,#38
	vmlal.u32 $D4,$H0#hi,$R4
	vmlal.u32 $D1,$H2#hi,$S4
	vmlal.u32 $D2,$H3#hi,$S4

	beq .Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
	vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4

	vmlal.u32 $D2,$H2#lo,$R0
	vmlal.u32 $D0,$H0#lo,$R0
	vmlal.u32 $D3,$H3#lo,$R0
	vmlal.u32 $D1,$H1#lo,$R0
	vmlal.u32 $D4,$H4#lo,$R0

	vmlal.u32 $D0,$H4#lo,$S1
	vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32 $D3,$H2#lo,$R1
	vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32 $D1,$H0#lo,$R1
	vmlal.u32 $D4,$H3#lo,$R1
	vmlal.u32 $D2,$H1#lo,$R1

	vmlal.u32 $D3,$H1#lo,$R2
	vld1.32 ${S4}[1],[$tbl1,:32]
	vmlal.u32 $D0,$H3#lo,$S2
	vld1.32 ${S4}[0],[$tbl0,:32]
	vmlal.u32 $D4,$H2#lo,$R2
	vmlal.u32 $D1,$H4#lo,$S2
	vmlal.u32 $D2,$H0#lo,$R2

	vmlal.u32 $D3,$H0#lo,$R3
	vmlal.u32 $D0,$H2#lo,$S3
	vmlal.u32 $D4,$H1#lo,$R3
	vmlal.u32 $D1,$H3#lo,$S3
	vmlal.u32 $D2,$H4#lo,$S3

	vmlal.u32 $D3,$H4#lo,$S4
	vorn $MASK,$MASK,$MASK @ all-ones
	vmlal.u32 $D0,$H1#lo,$S4
	vshr.u64 $MASK,$MASK,#38
	vmlal.u32 $D4,$H0#lo,$R4
	vmlal.u32 $D1,$H2#lo,$S4
	vmlal.u32 $D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64 $D3#lo,$D3#lo,$D3#hi
	vadd.i64 $D0#lo,$D0#lo,$D0#hi
	vadd.i64 $D4#lo,$D4#lo,$D4#hi
	vadd.i64 $D1#lo,$D1#lo,$D1#hi
	vadd.i64 $D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64 $T0,$D3,#26
	vand.i64 $D3,$D3,$MASK
	vshr.u64 $T1,$D0,#26
	vand.i64 $D0,$D0,$MASK
	vadd.i64 $D4,$D4,$T0 @ h3 -> h4
	vadd.i64 $D1,$D1,$T1 @ h0 -> h1

	vshr.u64 $T0,$D4,#26
	vand.i64 $D4,$D4,$MASK
	vshr.u64 $T1,$D1,#26
	vand.i64 $D1,$D1,$MASK
	vadd.i64 $D2,$D2,$T1 @ h1 -> h2

	vadd.i64 $D0,$D0,$T0
	vshl.u64 $T0,$T0,#2
	vshr.u64 $T1,$D2,#26
	vand.i64 $D2,$D2,$MASK
	vadd.i64 $D0,$D0,$T0 @ h4 -> h0
	vadd.i64 $D3,$D3,$T1 @ h2 -> h3

	vshr.u64 $T0,$D0,#26
	vand.i64 $D0,$D0,$MASK
	vshr.u64 $T1,$D3,#26
	vand.i64 $D3,$D3,$MASK
	vadd.i64 $D1,$D1,$T0 @ h0 -> h1
	vadd.i64 $D4,$D4,$T1 @ h3 -> h4

	cmp $len,#0
	bne .Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32 {$D4#lo[0]},[$ctx]

	vldmia sp!,{d8-d15} @ epilogue
	ldmia sp!,{r4-r7}
	ret @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#ifndef __KERNEL__
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lpoly1305_init
# endif
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
#endif
___
} }
$code.=<<___;
.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
.align 2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
	s/\bret\b/bx lr/go or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
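	# The qN#lo / qN#hi substitution above maps each 128-bit q register
	# onto its two 64-bit d-register halves, e.g. "q5#lo" becomes "d10"
	# and "q5#hi" becomes "d11".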

	print $_,"\n";
}
close STDOUT; # enforce flush