GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/poly1305-armv8.S
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden OPENSSL_armcap_P
.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

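// poly1305_init(ctx=x0, key=x1, func[2]=x2): zeroes the accumulator and
// the is_base2_26 flag, clamps the first 16 key bytes to form r
// (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) and stores it at ctx+32,
// then returns pointers to either the scalar or the NEON blocks/emit
// routines through x2, depending on the ARMV7_NEON capability bit.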
.type poly1305_init,%function
.align 5
poly1305_init:
AARCH64_VALID_CALL_TARGET
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]

csel x0,xzr,x0,eq
b.eq .Lno_key

adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]

ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
#ifdef __AARCH64EB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value

tst w17,#ARMV7_NEON

adrp x12,poly1305_blocks
add x12,x12,#:lo12:.Lpoly1305_blocks
adrp x7,poly1305_blocks_neon
add x7,x7,#:lo12:.Lpoly1305_blocks_neon
adrp x13,poly1305_emit
add x13,x13,#:lo12:.Lpoly1305_emit
adrp x8,poly1305_emit_neon
add x8,x8,#:lo12:.Lpoly1305_emit_neon

csel x12,x12,x7,eq
csel x13,x13,x8,eq

#ifdef __ILP32__
stp w12,w13,[x2]
#else
stp x12,x13,[x2]
#endif

mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init

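// poly1305_blocks(ctx=x0, inp=x1, len=x2, padbit=x3): for each 16-byte
// block m the accumulator is updated as
//     h = (h + m + padbit*2^128) * r  mod  2^130-5
// with h kept in x4,x5,x6 and r in x7,x8.  x9 holds s1 = r1 + (r1>>2);
// r1 is a multiple of 4 after clamping, so r1*2^128 == 5*(r1>>2) == s1
// (mod 2^130-5) and the h1*r1 partial product can be folded in as h1*s1.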
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
// The symbol .Lpoly1305_blocks is not a .globl symbol
// but a pointer to it is returned by poly1305_init
AARCH64_VALID_CALL_TARGET
ands x2,x2,#-16
b.eq .Lno_data

ldp x4,x5,[x0] // load hash value
ldp x7,x8,[x0,#32] // load key value
ldr x6,[x0,#16]
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop

.align 5
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __AARCH64EB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11

mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr

cbnz x2,.Loop

stp x4,x5,[x0] // store hash value
str x6,[x0,#16]

.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks

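// poly1305_emit(ctx=x0, mac=x1, nonce=x2): freeze h modulo 2^130-5 by
// computing h+5 and keeping it only if the sum reaches 2^130 (i.e. the
// original h was >= 2^130-5), then add the 128-bit nonce and store the
// low 128 bits as the little-endian tag.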
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
// The symbol .Lpoly1305_emit is not a .globl symbol
// but a pointer to it is returned by poly1305_init
AARCH64_VALID_CALL_TARGET
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq

#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit,.-poly1305_emit
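// poly1305_mult: local helper for the NEON path below; performs one
// h = h * r mod 2^130-5 step on x4,x5,x6 with r in x7,x8 and s1 in x9,
// the same multiply/fold as in the scalar .Loop above.  The final
// reduction relies on 2^130 == 5 (mod 2^130-5): the product bits at and
// above 2^130 contribute (x14 & ~3) + (x14>>2) = 5*(x14>>2), which is
// added back into the low limbs while x14 & 3 stays as the new h2.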
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr

ret
.size poly1305_mult,.-poly1305_mult

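// poly1305_splat: splits the 130-bit value in x4,x5,x6 (base 2^64) into
// five 26-bit limbs r0..r4 and stores them, along with 5*r1..5*r4, into
// the vector key table at x0.  The 16-byte stride leaves one 32-bit lane
// per power of r in every table entry.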
.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40

str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4

ret
.size poly1305_splat,.-poly1305_splat

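// poly1305_blocks_neon: vector implementation working on unreduced
// base 2^26 limbs; is_base2_26 at ctx+24 records which representation
// the stored hash currently uses.  Short inputs (<128 bytes) that have
// not yet switched representation are handed to the scalar
// poly1305_blocks; otherwise 64 bytes are processed per iteration as two
// interleaved pairs of blocks multiplied by the powers r^4..r^1.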
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
// but a pointer to it is returned by poly1305_init
AARCH64_VALID_CALL_TARGET
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
cbz x17,.Lpoly1305_blocks

.Lblocks_neon:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-80]!
add x29,sp,#0

ands x2,x2,#-16
b.eq .Lno_data_neon

cbz x17,.Lbase2_64_neon

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

tst x2,#31
b.eq .Leven_neon

ldp x7,x8,[x0,#32] // load key value

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)

and x10,x14,#-4 // ... so reduce
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x4,x10
adcs x5,x5,xzr
adc x6,x6,xzr

#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult
ldr x30,[sp,#8]

cbz x3,.Lstore_base2_64_neon

and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

cbnz x2,.Leven_neon

stp w10,w11,[x0] // store hash value base 2^26
stp w12,w13,[x0,#8]
str w14,[x0,#16]
b .Lno_data_neon

.align 4
.Lstore_base2_64_neon:
stp x4,x5,[x0] // store hash value base 2^64
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon

.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value

ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]

tst x2,#31
b.eq .Linit_neon

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult

.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

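// The powers r^1..r^4 are generated with poly1305_mult and stored by
// poly1305_splat into the table at ctx+48: each 16-byte entry holds one
// 26-bit limb, with lanes 0..3 carrying that limb of r^4, r^3, r^2 and
// r^1 respectively (the table is filled from lane 3 backwards).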
////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat

bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
ldr x30,[sp,#8]

add x16,x1,#32
adrp x17,.Lzeros
add x17,x17,#:lo12:.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

mov x4,#1
stur x4,[x0,#-24] // set is_base2_26
sub x0,x0,#48 // restore original x0
b .Ldo_neon

.align 4
.Leven_neon:
add x16,x1,#32
adrp x17,.Lzeros
add x17,x17,#:lo12:.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

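// .Ldo_neon: x16 points either at inp+32 (the second pair of blocks) or
// at .Lzeros when fewer than four blocks remain.  x3 still holds the
// padding bit; shifted left by 24 it sits on top of the 24 input bits
// of the fifth 26-bit limb (add x12,x3,x12,lsr#40 below).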
.Ldo_neon:
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
ldp x9,x13,[x16],#48

lsl x3,x3,#24
add x15,x0,#48

#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12

ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48

ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]

#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38

b.ls .Lskip_loop

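// The main loop interleaves the scalar base 2^64 -> base 2^26 splitting
// of the next 64 bytes of input with the vector multiplies for the
// current 64 bytes, so integer and SIMD work can overlap.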
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32

umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff

umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6

add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10

////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate

add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26

add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff

add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40

umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12

/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]

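// Carries propagate h3->h4 and h0->h1, then h1->h2, then h4->h0 (the
// wrap-around carry is multiplied by 5: added once as is and once
// shifted left by 2) and h2->h3, and finally h0->h1 and h3->h4 again.
// v31 holds the 26-bit mask (all-ones shifted right by 38); the
// bic #0xfc,lsl#24 masks the narrowed 32-bit lanes to 26 bits as well.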
ushr v29.2d,v22.2d,#26
xtn v27.2s,v22.2d
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
add v20.2d,v20.2d,v30.2d // h0 -> h1

ushr v29.2d,v23.2d,#26
xtn v28.2s,v23.2d
ushr v30.2d,v20.2d,#26
xtn v25.2s,v20.2d
bic v28.2s,#0xfc,lsl#24
add v21.2d,v21.2d,v30.2d // h1 -> h2

add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
shrn v30.2s,v21.2d,#26
xtn v26.2s,v21.2d
add v19.2d,v19.2d,v29.2d // h4 -> h0
bic v25.2s,#0xfc,lsl#24
add v27.2s,v27.2s,v30.2s // h2 -> h3
bic v26.2s,#0xfc,lsl#24

shrn v29.2s,v19.2d,#26
xtn v24.2s,v19.2d
ushr v30.2s,v27.2s,#26
bic v27.2s,#0xfc,lsl#24
bic v24.2s,#0xfc,lsl#24
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4

b.hi .Loop_neon

.Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s

////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

adds x2,x2,#32
b.ne .Long_tail

dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
add v17.2s,v12.2s,v27.2s
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s

.Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
umull2 v23.2d,v16.4s,v3.4s
umull2 v21.2d,v16.4s,v0.4s
umull2 v20.2d,v16.4s,v8.4s

dup v15.2d,v15.d[0]
umlal2 v19.2d,v14.4s,v0.4s
umlal2 v21.2d,v14.4s,v3.4s
umlal2 v22.2d,v14.4s,v5.4s
umlal2 v23.2d,v14.4s,v7.4s
umlal2 v20.2d,v14.4s,v1.4s

dup v17.2d,v17.d[0]
umlal2 v19.2d,v15.4s,v8.4s
umlal2 v22.2d,v15.4s,v3.4s
umlal2 v21.2d,v15.4s,v1.4s
umlal2 v23.2d,v15.4s,v5.4s
umlal2 v20.2d,v15.4s,v0.4s

dup v18.2d,v18.d[0]
umlal2 v22.2d,v17.4s,v0.4s
umlal2 v23.2d,v17.4s,v1.4s
umlal2 v19.2d,v17.4s,v4.4s
umlal2 v20.2d,v17.4s,v6.4s
umlal2 v21.2d,v17.4s,v8.4s

umlal2 v22.2d,v18.4s,v8.4s
umlal2 v19.2d,v18.4s,v2.4s
umlal2 v23.2d,v18.4s,v0.4s
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s

b.eq .Lshort_tail

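// The umull2/umlal2 products above used lanes 2-3 of the key table,
// i.e. r^2:r^1.  If four blocks remained, the running hash plus
// inp[0:1] still has to be folded in with r^4:r^3 (lanes 0-1) below;
// with only two blocks left, the b.eq above branches straight to
// .Lshort_tail.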
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate

add v9.2s,v9.2s,v24.2s
umlal v22.2d,v11.2s,v1.2s
umlal v19.2d,v11.2s,v6.2s
umlal v23.2d,v11.2s,v3.2s
umlal v20.2d,v11.2s,v8.2s
umlal v21.2d,v11.2s,v0.2s

add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.2s
umlal v19.2d,v9.2s,v0.2s
umlal v23.2d,v9.2s,v7.2s
umlal v20.2d,v9.2s,v1.2s
umlal v21.2d,v9.2s,v3.2s

add v12.2s,v12.2s,v27.2s
umlal v22.2d,v10.2s,v3.2s
umlal v19.2d,v10.2s,v8.2s
umlal v23.2d,v10.2s,v5.2s
umlal v20.2d,v10.2s,v0.2s
umlal v21.2d,v10.2s,v1.2s

add v13.2s,v13.2s,v28.2s
umlal v22.2d,v12.2s,v0.2s
umlal v19.2d,v12.2s,v4.2s
umlal v23.2d,v12.2s,v1.2s
umlal v20.2d,v12.2s,v6.2s
umlal v21.2d,v12.2s,v8.2s

umlal v22.2d,v13.2s,v8.2s
umlal v19.2d,v13.2s,v2.2s
umlal v23.2d,v13.2s,v0.2s
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s

.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add

addp v22.2d,v22.2d,v22.2d
ldp d8,d9,[sp,#16] // meet ABI requirements
addp v19.2d,v19.2d,v19.2d
ldp d10,d11,[sp,#32]
addp v23.2d,v23.2d,v23.2d
ldp d12,d13,[sp,#48]
addp v20.2d,v20.2d,v20.2d
ldp d14,d15,[sp,#64]
addp v21.2d,v21.2d,v21.2d

////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing

ushr v29.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b

add v23.2d,v23.2d,v29.2d // h3 -> h4
add v20.2d,v20.2d,v30.2d // h0 -> h1

ushr v29.2d,v23.2d,#26
and v23.16b,v23.16b,v31.16b
ushr v30.2d,v20.2d,#26
and v20.16b,v20.16b,v31.16b
add v21.2d,v21.2d,v30.2d // h1 -> h2

add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
ushr v30.2d,v21.2d,#26
and v21.16b,v21.16b,v31.16b
add v19.2d,v19.2d,v29.2d // h4 -> h0
add v22.2d,v22.2d,v30.2d // h2 -> h3

ushr v29.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
ushr v30.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
add v20.2d,v20.2d,v29.2d // h0 -> h1
add v23.2d,v23.2d,v30.2d // h3 -> h4

////////////////////////////////////////////////////////////////
// write the result, can be partially reduced

st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
st1 {v23.s}[0],[x0]

.Lno_data_neon:
ldr x29,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon

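// poly1305_emit_neon: same tag computation as poly1305_emit, except that
// a hash still held in base 2^26 (is_base2_26 set) is first converted
// back to base 2^64, folding the bits at or above 2^130 back in
// multiplied by 5 before the final freeze and nonce addition.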
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
// The symbol .Lpoly1305_emit_neon is not a .globl symbol
// but a pointer to it is returned by poly1305_init
AARCH64_VALID_CALL_TARGET
ldr x17,[x0,#24]
cbz x17,poly1305_emit

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x6,x6,xzr // can be partially reduced...

ldp x10,x11,[x2] // load nonce

and x12,x6,#-4 // ... so reduce
add x12,x12,x6,lsr#2
and x6,x6,#3
adds x4,x4,x12
adcs x5,x5,xzr
adc x6,x6,xzr

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq

#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit_neon,.-poly1305_emit_neon

.section .rodata

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2