GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/arm/poly1305-armv4.S
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.text

.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

eor r3,r3,r3
cmp r1,#0
str r3,[r0,#0] @ zero hash value
str r3,[r0,#4]
str r3,[r0,#8]
str r3,[r0,#12]
str r3,[r0,#16]
str r3,[r0,#36] @ is_base2_26
add r0,r0,#20

#ifdef __thumb2__
it eq
#endif
moveq r0,#0
beq .Lno_key

#if __ARM_MAX_ARCH__>=7
adr r11,.Lpoly1305_init
ldr r12,.LOPENSSL_armcap
#endif
ldrb r4,[r1,#0]
mov r10,#0x0fffffff
ldrb r5,[r1,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[r1,#2]
ldrb r7,[r1,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[r1,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[r1,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[r1,#6]
and r4,r4,r10

#if __ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
ldrb r8,[r1,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[r1,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[r1,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[r1,#10]
and r5,r5,r3

#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
# ifdef __thumb2__
adr r9,.Lpoly1305_blocks_neon
adr r11,.Lpoly1305_blocks
adr r12,.Lpoly1305_emit
adr r10,.Lpoly1305_emit_neon
itt ne
movne r11,r9
movne r12,r10
orr r11,r11,#1 @ thumb-ify address
orr r12,r12,#1
# else
addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
ldrb r9,[r1,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[r1,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[r1,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[r1,#14]
and r6,r6,r3

ldrb r10,[r1,#15]
orr r7,r7,r8,lsl#8
str r4,[r0,#0]
orr r7,r7,r9,lsl#16
str r5,[r0,#4]
orr r7,r7,r10,lsl#24
str r6,[r0,#8]
and r7,r7,r3
str r7,[r0,#12]
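@ Editorial note (not part of the generated file): the masks used above
@ (r10 = 0x0fffffff for the first key word, r3 = 0x0ffffffc for the other
@ three) apply the standard Poly1305 clamp to the first 16 key bytes
@ before they are stored at ctx+20..32 as the multiplier r.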
#if __ARM_MAX_ARCH__>=7
stmia r2,{r11,r12} @ fill functions table
mov r0,#1
#else
mov r0,#0
#endif
.Lno_key:
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

ands r2,r2,#-16
beq .Lno_data

cmp r3,#0
add r2,r2,r1 @ end pointer
sub sp,sp,#32

ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12} @ load context

str r0,[sp,#12] @ offload stuff
mov lr,r1
str r2,[sp,#16]
str r10,[sp,#20]
str r11,[sp,#24]
str r12,[sp,#28]
b .Loop

.Loop:
#if __ARM_ARCH__<7
ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi r8,r8,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
ldrb r3,[lr,#-13]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-12]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-11]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-10]
adds r4,r4,r3 @ accumulate input

ldrb r3,[lr,#-9]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-8]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-7]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-6]
adcs r5,r5,r3

ldrb r3,[lr,#-5]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-4]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-3]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-2]
adcs r6,r6,r3

ldrb r3,[lr,#-1]
orr r1,r0,r1,lsl#8
str lr,[sp,#8] @ offload input pointer
orr r2,r1,r2,lsl#16
add r10,r10,r10,lsr#2
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi r8,r8,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
# ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
# endif
adds r4,r4,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs r5,r5,r1
add r10,r10,r10,lsr#2
adcs r6,r6,r2
#endif
add r11,r11,r11,lsr#2
adcs r7,r7,r3
add r12,r12,r12,lsr#2

umull r2,r3,r5,r9
adc r8,r8,#0
umull r0,r1,r4,r9
umlal r2,r3,r8,r10
umlal r0,r1,r7,r10
ldr r10,[sp,#20] @ reload r10
umlal r2,r3,r6,r12
umlal r0,r1,r5,r12
umlal r2,r3,r7,r11
umlal r0,r1,r6,r11
umlal r2,r3,r4,r10
str r0,[sp,#0] @ future r4
mul r0,r11,r8
ldr r11,[sp,#24] @ reload r11
adds r2,r2,r1 @ d1+=d0>>32
eor r1,r1,r1
adc lr,r3,#0 @ future r6
str r2,[sp,#4] @ future r5

mul r2,r12,r8
eor r3,r3,r3
umlal r0,r1,r7,r12
ldr r12,[sp,#28] @ reload r12
umlal r2,r3,r7,r9
umlal r0,r1,r6,r9
umlal r2,r3,r6,r10
umlal r0,r1,r5,r10
umlal r2,r3,r5,r11
umlal r0,r1,r4,r11
umlal r2,r3,r4,r12
ldr r4,[sp,#0]
mul r8,r9,r8
ldr r5,[sp,#4]

adds r6,lr,r0 @ d2+=d1>>32
ldr lr,[sp,#8] @ reload input pointer
adc r1,r1,#0
adds r7,r2,r1 @ d3+=d2>>32
ldr r0,[sp,#16] @ reload end pointer
adc r3,r3,#0
add r8,r8,r3 @ h4+=d3>>32

and r1,r8,#-4
and r8,r8,#3
add r1,r1,r1,lsr#2 @ *=5
adds r4,r4,r1
adcs r5,r5,#0
adcs r6,r6,#0
adcs r7,r7,#0
adc r8,r8,#0
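@ Editorial note (not part of the generated file): the sequence starting
@ at "and r1,r8,#-4" above folds the bits of h4 at 2^130 and higher back
@ into h0 as 5*(h4>>2), using 2^130 == 5 (mod 2^130-5); the full
@ reduction is deferred until poly1305_emit.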

cmp r0,lr @ done yet?
bhi .Loop

ldr r0,[sp,#12]
add sp,sp,#32
stmia r0,{r4,r5,r6,r7,r8} @ store the result

.Lno_data:
#if __ARM_ARCH__>=5
ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

ldmia r0,{r3,r4,r5,r6,r7}
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
adcs r10,r5,#0
adcs r11,r6,#0
adc r7,r7,#0
tst r7,#4 @ did it carry/borrow?

#ifdef __thumb2__
it ne
#endif
movne r3,r8
ldr r8,[r2,#0]
#ifdef __thumb2__
it ne
#endif
movne r4,r9
ldr r9,[r2,#4]
#ifdef __thumb2__
it ne
#endif
movne r5,r10
ldr r10,[r2,#8]
#ifdef __thumb2__
it ne
#endif
movne r6,r11
ldr r11,[r2,#12]

adds r3,r3,r8
adcs r4,r4,r9
adcs r5,r5,r10
adc r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
# endif
str r3,[r1,#0]
str r4,[r1,#4]
str r5,[r1,#8]
str r6,[r1,#12]
#else
strb r3,[r1,#0]
mov r3,r3,lsr#8
strb r4,[r1,#4]
mov r4,r4,lsr#8
strb r5,[r1,#8]
mov r5,r5,lsr#8
strb r6,[r1,#12]
mov r6,r6,lsr#8

strb r3,[r1,#1]
mov r3,r3,lsr#8
strb r4,[r1,#5]
mov r4,r4,lsr#8
strb r5,[r1,#9]
mov r5,r5,lsr#8
strb r6,[r1,#13]
mov r6,r6,lsr#8

strb r3,[r1,#2]
mov r3,r3,lsr#8
strb r4,[r1,#6]
mov r4,r4,lsr#8
strb r5,[r1,#10]
mov r5,r5,lsr#8
strb r6,[r1,#14]
mov r6,r6,lsr#8

strb r3,[r1,#3]
strb r4,[r1,#7]
strb r5,[r1,#11]
strb r6,[r1,#15]
#endif
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
#if __ARM_MAX_ARCH__>=7
.fpu neon

.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
ldr r4,[r0,#20] @ load key base 2^32
ldr r5,[r0,#24]
ldr r6,[r0,#28]
ldr r7,[r0,#32]

and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
and r3,r3,#0x03ffffff
and r4,r4,#0x03ffffff
and r5,r5,#0x03ffffff

vdup.32 d0,r2 @ r^1 in both lanes
add r2,r3,r3,lsl#2 @ *5
vdup.32 d1,r3
add r3,r4,r4,lsl#2
vdup.32 d2,r2
vdup.32 d3,r4
add r4,r5,r5,lsl#2
vdup.32 d4,r3
vdup.32 d5,r5
add r5,r6,r6,lsl#2
vdup.32 d6,r4
vdup.32 d7,r6
vdup.32 d8,r5

mov r5,#2 @ counter

.Lsquare_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
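@ Editorial note (not part of the generated file): per the vdup/add setup
@ in poly1305_init_neon above, d0,d1,d3,d5,d7 hold r0..r4 and d2,d4,d6,d8
@ hold 5*r1..5*r4, so every "h*5*r" term in these formulas is a single
@ vmlal.u32 against one of the pre-multiplied lanes.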

vmull.u32 q5,d0,d0[1]
vmull.u32 q6,d1,d0[1]
vmull.u32 q7,d3,d0[1]
vmull.u32 q8,d5,d0[1]
vmull.u32 q9,d7,d0[1]

vmlal.u32 q5,d7,d2[1]
vmlal.u32 q6,d0,d1[1]
vmlal.u32 q7,d1,d1[1]
vmlal.u32 q8,d3,d1[1]
vmlal.u32 q9,d5,d1[1]

vmlal.u32 q5,d5,d4[1]
vmlal.u32 q6,d7,d4[1]
vmlal.u32 q8,d1,d3[1]
vmlal.u32 q7,d0,d3[1]
vmlal.u32 q9,d3,d3[1]

vmlal.u32 q5,d3,d6[1]
vmlal.u32 q8,d0,d5[1]
vmlal.u32 q6,d5,d6[1]
vmlal.u32 q7,d7,d6[1]
vmlal.u32 q9,d1,d5[1]

vmlal.u32 q8,d7,d8[1]
vmlal.u32 q5,d1,d8[1]
vmlal.u32 q6,d3,d8[1]
vmlal.u32 q7,d5,d8[1]
vmlal.u32 q9,d0,d7[1]

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
@
@ H0>>+H1>>+H2>>+H3>>+H4
@ H3>>+H4>>*5+H0>>+H1
@
@ Trivia.
@
@ Result of multiplication of n-bit number by m-bit number is
@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
@ m-bit number multiplied by 2^n is still n+m bits wide.
@
@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
@ one is n+1 bits wide.
@
@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
@ can be 27. However! In cases when their width exceeds 26 bits
@ they are limited by 2^26+2^6. This in turn means that *sum*
@ of the products with these values can still be viewed as sum
@ of 52-bit numbers as long as the amount of addends is not a
@ power of 2. For example,
@
@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
@
@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
@ 8 * (2^52) or 2^55. However, the value is then multiplied
@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
@ which is less than 32 * (2^52) or 2^57. And when processing
@ data we are looking at triple as many addends...
@
@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
@ instruction accepts 2x32-bit input and writes 2x64-bit result.
@ This means that result of reduction has to be compressed upon
@ loop wrap-around. This can be done in the process of reduction
@ to minimize amount of instructions [as well as amount of
@ 128-bit instructions, which benefits low-end processors], but
@ one has to watch for H2 (which is narrower than H0) and 5*H4
@ not being wider than 58 bits, so that result of right shift
@ by 26 bits fits in 32 bits. This is also useful on x86,
@ because it allows to use paddd in place of paddq, which
@ benefits Atom, where paddq is ridiculously slow.
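@ Editorial note (not part of the generated file), spelling the bound out:
@ (2^26 + 2^6)^2 = 2^52 + 2^33 + 2^12, so five such products stay below
@ 8*2^52 = 2^55, and after the extra *5 the total stays below
@ 32*2^52 = 2^57, which is where the 55- and 57-bit figures above come from.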

vshr.u64 q15,q8,#26
vmovn.i64 d16,q8
vshr.u64 q4,q5,#26
vmovn.i64 d10,q5
vadd.i64 q9,q9,q15 @ h3 -> h4
vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
vadd.i64 q6,q6,q4 @ h0 -> h1
vbic.i32 d10,#0xfc000000

vshrn.u64 d30,q9,#26
vmovn.i64 d18,q9
vshr.u64 q4,q6,#26
vmovn.i64 d12,q6
vadd.i64 q7,q7,q4 @ h1 -> h2
vbic.i32 d18,#0xfc000000
vbic.i32 d12,#0xfc000000

vadd.i32 d10,d10,d30
vshl.u32 d30,d30,#2
vshrn.u64 d8,q7,#26
vmovn.i64 d14,q7
vadd.i32 d10,d10,d30 @ h4 -> h0
vadd.i32 d16,d16,d8 @ h2 -> h3
vbic.i32 d14,#0xfc000000

vshr.u32 d30,d10,#26
vbic.i32 d10,#0xfc000000
vshr.u32 d8,d16,#26
vbic.i32 d16,#0xfc000000
vadd.i32 d12,d12,d30 @ h0 -> h1
vadd.i32 d18,d18,d8 @ h3 -> h4

subs r5,r5,#1
beq .Lsquare_break_neon

add r6,r0,#(48+0*9*4)
add r7,r0,#(48+1*9*4)

vtrn.32 d0,d10 @ r^2:r^1
vtrn.32 d3,d14
vtrn.32 d5,d16
vtrn.32 d1,d12
vtrn.32 d7,d18

vshl.u32 d4,d3,#2 @ *5
vshl.u32 d6,d5,#2
vshl.u32 d2,d1,#2
vshl.u32 d8,d7,#2
vadd.i32 d4,d4,d3
vadd.i32 d2,d2,d1
vadd.i32 d6,d6,d5
vadd.i32 d8,d8,d7

vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vst1.32 {d8[0]},[r6,:32]
vst1.32 {d8[1]},[r7,:32]

b .Lsquare_neon

.align 4
.Lsquare_break_neon:
add r6,r0,#(48+2*4*9)
add r7,r0,#(48+3*4*9)

vmov d0,d10 @ r^4:r^3
vshl.u32 d2,d12,#2 @ *5
vmov d1,d12
vshl.u32 d4,d14,#2
vmov d3,d14
vshl.u32 d6,d16,#2
vmov d5,d16
vshl.u32 d8,d18,#2
vmov d7,d18
vadd.i32 d2,d2,d12
vadd.i32 d4,d4,d14
vadd.i32 d6,d6,d16
vadd.i32 d8,d8,d18

vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vst1.32 {d8[0]},[r6]
vst1.32 {d8[1]},[r7]

bx lr @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon

.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr ip,[r0,#36] @ is_base2_26
ands r2,r2,#-16
beq .Lno_data_neon

cmp r2,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
beq .Lpoly1305_blocks

.Lenter_neon:
stmdb sp!,{r4,r5,r6,r7}
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so

tst ip,ip @ is_base2_26?
bne .Lbase2_26_neon

stmdb sp!,{r1,r2,r3,lr}
bl poly1305_init_neon

ldr r4,[r0,#0] @ load hash value base 2^32
ldr r5,[r0,#4]
ldr r6,[r0,#8]
ldr r7,[r0,#12]
ldr ip,[r0,#16]

and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
veor d10,d10,d10
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
veor d12,d12,d12
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
veor d14,d14,d14
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
veor d16,d16,d16
and r3,r3,#0x03ffffff
orr r6,r6,ip,lsl#24
veor d18,d18,d18
and r4,r4,#0x03ffffff
mov r1,#1
and r5,r5,#0x03ffffff
str r1,[r0,#36] @ is_base2_26

vmov.32 d10[0],r2
vmov.32 d12[0],r3
vmov.32 d14[0],r4
vmov.32 d16[0],r5
vmov.32 d18[0],r6
adr r5,.Lzeros

ldmia sp!,{r1,r2,r3,lr}
b .Lbase2_32_neon

.align 4
.Lbase2_26_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load hash value

veor d10,d10,d10
veor d12,d12,d12
veor d14,d14,d14
veor d16,d16,d16
veor d18,d18,d18
vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
adr r5,.Lzeros
vld1.32 {d18[0]},[r0]
sub r0,r0,#16 @ rewind

.Lbase2_32_neon:
add r4,r1,#32
mov r3,r3,lsl#24
tst r2,#31
beq .Leven

vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
vmov.32 d28[0],r3
sub r2,r2,#16
add r4,r1,#32

# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
# endif
vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
vshl.u32 d26,d26,#18

vsri.u32 d26,d24,#14
vshl.u32 d24,d24,#12
vadd.i32 d29,d28,d18 @ add hash value and move to #hi

vbic.i32 d26,#0xfc000000
vsri.u32 d24,d22,#20
vshl.u32 d22,d22,#6

vbic.i32 d24,#0xfc000000
vsri.u32 d22,d20,#26
vadd.i32 d27,d26,d16

vbic.i32 d20,#0xfc000000
vbic.i32 d22,#0xfc000000
vadd.i32 d25,d24,d14

vadd.i32 d21,d20,d10
vadd.i32 d23,d22,d12

mov r7,r5
add r6,r0,#48

cmp r2,r2
b .Long_tail

.align 4
.Leven:
subs r2,r2,#64
it lo
movlo r4,r5

vmov.i32 q14,#1<<24 @ padbit, yes, always
vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
add r4,r4,#64
itt hi
addhi r7,r0,#(48+1*9*4)
addhi r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
# endif
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vshl.u32 q13,q13,#18

vsri.u32 q13,q12,#14
vshl.u32 q12,q12,#12

vbic.i32 q13,#0xfc000000
vsri.u32 q12,q11,#20
vshl.u32 q11,q11,#6

vbic.i32 q12,#0xfc000000
vsri.u32 q11,q10,#26

vbic.i32 q10,#0xfc000000
vbic.i32 q11,#0xfc000000

bls .Lskip_loop

vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
b .Loop_neon

.align 5
.Loop_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ ___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@ ___________________/ ____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on reduction in previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2

vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
vmull.u32 q7,d25,d0[1]
vadd.i32 d20,d20,d10
vmull.u32 q5,d21,d0[1]
vadd.i32 d26,d26,d16
vmull.u32 q8,d27,d0[1]
vmlal.u32 q7,d23,d1[1]
vadd.i32 d22,d22,d12
vmull.u32 q6,d23,d0[1]

vadd.i32 d28,d28,d18
vmull.u32 q9,d29,d0[1]
subs r2,r2,#64
vmlal.u32 q5,d29,d2[1]
it lo
movlo r4,r5
vmlal.u32 q8,d25,d1[1]
vld1.32 d8[1],[r7,:32]
vmlal.u32 q6,d21,d1[1]
vmlal.u32 q9,d27,d1[1]

vmlal.u32 q5,d27,d4[1]
vmlal.u32 q8,d23,d3[1]
vmlal.u32 q9,d25,d3[1]
vmlal.u32 q6,d29,d4[1]
vmlal.u32 q7,d21,d3[1]

vmlal.u32 q8,d21,d5[1]
vmlal.u32 q5,d25,d6[1]
vmlal.u32 q9,d23,d5[1]
vmlal.u32 q6,d27,d6[1]
vmlal.u32 q7,d29,d6[1]

vmlal.u32 q8,d29,d8[1]
vmlal.u32 q5,d23,d8[1]
vmlal.u32 q9,d21,d7[1]
vmlal.u32 q6,d25,d8[1]
vmlal.u32 q7,d27,d8[1]

vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
add r4,r4,#64

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate

vmlal.u32 q8,d26,d0[0]
vmlal.u32 q5,d20,d0[0]
vmlal.u32 q9,d28,d0[0]
vmlal.u32 q6,d22,d0[0]
vmlal.u32 q7,d24,d0[0]
vld1.32 d8[0],[r6,:32]

vmlal.u32 q8,d24,d1[0]
vmlal.u32 q5,d28,d2[0]
vmlal.u32 q9,d26,d1[0]
vmlal.u32 q6,d20,d1[0]
vmlal.u32 q7,d22,d1[0]

vmlal.u32 q8,d22,d3[0]
vmlal.u32 q5,d26,d4[0]
vmlal.u32 q9,d24,d3[0]
vmlal.u32 q6,d28,d4[0]
vmlal.u32 q7,d20,d3[0]

vmlal.u32 q8,d20,d5[0]
vmlal.u32 q5,d24,d6[0]
vmlal.u32 q9,d22,d5[0]
vmlal.u32 q6,d26,d6[0]
vmlal.u32 q8,d28,d8[0]

vmlal.u32 q7,d28,d6[0]
vmlal.u32 q5,d22,d8[0]
vmlal.u32 q9,d20,d7[0]
vmov.i32 q14,#1<<24 @ padbit, yes, always
vmlal.u32 q6,d24,d8[0]
vmlal.u32 q7,d26,d8[0]

vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
# ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q11,q11
vrev32.8 q12,q12
vrev32.8 q13,q13
# endif

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

vshr.u64 q15,q8,#26
vmovn.i64 d16,q8
vshr.u64 q4,q5,#26
vmovn.i64 d10,q5
vadd.i64 q9,q9,q15 @ h3 -> h4
vbic.i32 d16,#0xfc000000
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vadd.i64 q6,q6,q4 @ h0 -> h1
vshl.u32 q13,q13,#18
vbic.i32 d10,#0xfc000000

vshrn.u64 d30,q9,#26
vmovn.i64 d18,q9
vshr.u64 q4,q6,#26
vmovn.i64 d12,q6
vadd.i64 q7,q7,q4 @ h1 -> h2
vsri.u32 q13,q12,#14
vbic.i32 d18,#0xfc000000
vshl.u32 q12,q12,#12
vbic.i32 d12,#0xfc000000

vadd.i32 d10,d10,d30
vshl.u32 d30,d30,#2
vbic.i32 q13,#0xfc000000
vshrn.u64 d8,q7,#26
vmovn.i64 d14,q7
vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
vsri.u32 q12,q11,#20
vadd.i32 d16,d16,d8 @ h2 -> h3
vshl.u32 q11,q11,#6
vbic.i32 d14,#0xfc000000
vbic.i32 q12,#0xfc000000

vshrn.u64 d30,q5,#26 @ re-narrow
vmovn.i64 d10,q5
vsri.u32 q11,q10,#26
vbic.i32 q10,#0xfc000000
vshr.u32 d8,d16,#26
vbic.i32 d16,#0xfc000000
vbic.i32 d10,#0xfc000000
vadd.i32 d12,d12,d30 @ h0 -> h1
vadd.i32 d18,d18,d8 @ h3 -> h4
vbic.i32 q11,#0xfc000000

bhi .Loop_neon

.Lskip_loop:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

add r7,r0,#(48+0*9*4)
add r6,r0,#(48+1*9*4)
adds r2,r2,#32
it ne
movne r2,#0
bne .Long_tail

vadd.i32 d25,d24,d14 @ add hash value and move to #hi
vadd.i32 d21,d20,d10
vadd.i32 d27,d26,d16
vadd.i32 d23,d22,d12
vadd.i32 d29,d28,d18

.Long_tail:
vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2

vadd.i32 d24,d24,d14 @ can be redundant
vmull.u32 q7,d25,d0
vadd.i32 d20,d20,d10
vmull.u32 q5,d21,d0
vadd.i32 d26,d26,d16
vmull.u32 q8,d27,d0
vadd.i32 d22,d22,d12
vmull.u32 q6,d23,d0
vadd.i32 d28,d28,d18
vmull.u32 q9,d29,d0

vmlal.u32 q5,d29,d2
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vmlal.u32 q8,d25,d1
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vmlal.u32 q6,d21,d1
vmlal.u32 q9,d27,d1
vmlal.u32 q7,d23,d1

vmlal.u32 q8,d23,d3
vld1.32 d8[1],[r7,:32]
vmlal.u32 q5,d27,d4
vld1.32 d8[0],[r6,:32]
vmlal.u32 q9,d25,d3
vmlal.u32 q6,d29,d4
vmlal.u32 q7,d21,d3

vmlal.u32 q8,d21,d5
it ne
addne r7,r0,#(48+2*9*4)
vmlal.u32 q5,d25,d6
it ne
addne r6,r0,#(48+3*9*4)
vmlal.u32 q9,d23,d5
vmlal.u32 q6,d27,d6
vmlal.u32 q7,d29,d6

vmlal.u32 q8,d29,d8
vorn q0,q0,q0 @ all-ones, can be redundant
vmlal.u32 q5,d23,d8
vshr.u64 q0,q0,#38
vmlal.u32 q9,d21,d7
vmlal.u32 q6,d25,d8
vmlal.u32 q7,d27,d8

beq .Lshort_tail

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate

vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4

vmlal.u32 q7,d24,d0
vmlal.u32 q5,d20,d0
vmlal.u32 q8,d26,d0
vmlal.u32 q6,d22,d0
vmlal.u32 q9,d28,d0

vmlal.u32 q5,d28,d2
vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
vmlal.u32 q8,d24,d1
vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
vmlal.u32 q6,d20,d1
vmlal.u32 q9,d26,d1
vmlal.u32 q7,d22,d1

vmlal.u32 q8,d22,d3
vld1.32 d8[1],[r7,:32]
vmlal.u32 q5,d26,d4
vld1.32 d8[0],[r6,:32]
vmlal.u32 q9,d24,d3
vmlal.u32 q6,d28,d4
vmlal.u32 q7,d20,d3

vmlal.u32 q8,d20,d5
vmlal.u32 q5,d24,d6
vmlal.u32 q9,d22,d5
vmlal.u32 q6,d26,d6
vmlal.u32 q7,d28,d6

vmlal.u32 q8,d28,d8
vorn q0,q0,q0 @ all-ones
vmlal.u32 q5,d22,d8
vshr.u64 q0,q0,#38
vmlal.u32 q9,d20,d7
vmlal.u32 q6,d24,d8
vmlal.u32 q7,d26,d8

.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition

vadd.i64 d16,d16,d17
vadd.i64 d10,d10,d11
vadd.i64 d18,d18,d19
vadd.i64 d12,d12,d13
vadd.i64 d14,d14,d15

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing

vshr.u64 q15,q8,#26
vand.i64 q8,q8,q0
vshr.u64 q4,q5,#26
vand.i64 q5,q5,q0
vadd.i64 q9,q9,q15 @ h3 -> h4
vadd.i64 q6,q6,q4 @ h0 -> h1

vshr.u64 q15,q9,#26
vand.i64 q9,q9,q0
vshr.u64 q4,q6,#26
vand.i64 q6,q6,q0
vadd.i64 q7,q7,q4 @ h1 -> h2

vadd.i64 q5,q5,q15
vshl.u64 q15,q15,#2
vshr.u64 q4,q7,#26
vand.i64 q7,q7,q0
vadd.i64 q5,q5,q15 @ h4 -> h0
vadd.i64 q8,q8,q4 @ h2 -> h3

vshr.u64 q15,q5,#26
vand.i64 q5,q5,q0
vshr.u64 q4,q8,#26
vand.i64 q8,q8,q0
vadd.i64 q6,q6,q15 @ h0 -> h1
vadd.i64 q9,q9,q4 @ h3 -> h4

cmp r2,#0
bne .Leven

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store hash value

vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
vst1.32 {d18[0]},[r0]

vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ epilogue
ldmia sp!,{r4,r5,r6,r7}
.Lno_data_neon:
bx lr @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon

.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
ldr ip,[r0,#36] @ is_base2_26

stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

tst ip,ip
beq .Lpoly1305_emit_enter

ldmia r0,{r3,r4,r5,r6,r7}
eor r8,r8,r8

adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
mov r4,r4,lsr#6
adcs r4,r4,r5,lsl#20
mov r5,r5,lsr#12
adcs r5,r5,r6,lsl#14
mov r6,r6,lsr#18
adcs r6,r6,r7,lsl#8
adc r7,r8,r7,lsr#24 @ can be partially reduced ...
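@ Editorial note (not part of the generated file): the add/shift ladder
@ above repacks the five base-2^26 limbs h0..h4 (r3..r7) into four 32-bit
@ words plus a carry, e.g. word0 = h0 | h1<<26 and word1 = h1>>6 | h2<<20,
@ with the leftover high bits of h4 reduced modulo 2^130-5 just below.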

and r8,r7,#-4 @ ... so reduce
and r7,r7,#3
add r8,r8,r8,lsr#2 @ *= 5
adds r3,r3,r8
adcs r4,r4,#0
adcs r5,r5,#0
adcs r6,r6,#0
adc r7,r7,#0

adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
adcs r10,r5,#0
adcs r11,r6,#0
adc r7,r7,#0
tst r7,#4 @ did it carry/borrow?

it ne
movne r3,r8
ldr r8,[r2,#0]
it ne
movne r4,r9
ldr r9,[r2,#4]
it ne
movne r5,r10
ldr r10,[r2,#8]
it ne
movne r6,r11
ldr r11,[r2,#12]

adds r3,r3,r8 @ accumulate nonce
adcs r4,r4,r9
adcs r5,r5,r10
adc r6,r6,r11

# ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
# endif
str r3,[r1,#0] @ store the result
str r4,[r1,#4]
str r5,[r1,#8]
str r6,[r1,#12]

ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
bx lr @ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#if __ARM_MAX_ARCH__>=7

.hidden OPENSSL_armcap_P
#endif