/*
 * GitHub Repository: freebsd/freebsd-src
 * Path: blob/main/sys/crypto/openssl/aarch64/chacha-armv8.S
 */
1
/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
2
#include "arm_arch.h"
3
#ifndef __KERNEL__
4
5
.hidden OPENSSL_armcap_P
6
7
8
#endif
9
10
// Read-only constants shared by the ChaCha20 routines in this file.
// The three 16-byte tables are laid out back to back, and the NEON code
// relies on that order (it loads .Lsigma with post-increment and then reads
// .Lone and .Lrot24 from the advanced pointer).
.section .rodata
11
12
.align 5
13
// First four ChaCha state words. Read least-significant byte first, the two
// quads spell the ASCII text "expand 32-byte k".
.Lsigma:
14
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
15
// Per-lane offsets 1,2,3,4. ChaCha20_neon loads them into v8 and adds them
// to the replicated block-counter word so each vector lane gets its own
// counter value.
.Lone:
16
.long 1,2,3,4
17
// Byte indices for tbl. Within every 32-bit lane, output bytes 0..3 come
// from input bytes 3,0,1,2, i.e. a rotate right by 24 bits (left by 8),
// the same rotation the scalar code performs with ror ...,#24.
.Lrot24:
18
.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
19
// NUL-terminated ASCII string "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm".
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
20
.align 2
21
22
.text
23
24
// ChaCha20_ctr32_dflt: XOR a buffer with ChaCha20 keystream (scalar code
// plus dispatch to the NEON code).
// Register use on entry, as read by the code below:
//   x0 = output pointer        x1 = input pointer
//   x2 = length in bytes       x3 = pointer to 32 bytes of key
//   x4 = pointer to the 16-byte counter block
// Inputs shorter than 192 bytes always use the scalar code at .Lshort.
// Longer inputs go to .LChaCha20_neon when OPENSSL_armcap_P has the
// ARMV7_NEON bit set; that check is compiled out when __KERNEL__ is defined.
// x19-x28, x29 and x30 are saved in a 96-byte frame and restored before ret.
// NOTE(review): there is no zero-length check on this entry point. With
// x2 == 0 the byte loop at .Loop_tail would start with index 0 and count
// upward. ChaCha20_ctr32 filters that case before it gets here - confirm for
// any other caller.
.globl ChaCha20_ctr32_dflt
25
.type ChaCha20_ctr32_dflt,%function
26
.align 5
27
ChaCha20_ctr32_dflt:
28
AARCH64_SIGN_LINK_REGISTER
29
cmp x2,#192
30
b.lo .Lshort
31
#ifndef __KERNEL__
32
adrp x17,OPENSSL_armcap_P
33
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
34
// ChaCha20_ctr32 branches here with the capability word already in w17.
.Lcheck_neon:
35
tst w17,#ARMV7_NEON
36
b.ne .LChaCha20_neon
37
#endif
38
39
// Scalar path. Also the target of the short-input branches in ChaCha20_ctr32.
.Lshort:
40
stp x29,x30,[sp,#-96]!
41
add x29,sp,#0
42
43
adrp x5,.Lsigma
44
add x5,x5,#:lo12:.Lsigma
45
stp x19,x20,[sp,#16]
46
stp x21,x22,[sp,#32]
47
stp x23,x24,[sp,#48]
48
stp x25,x26,[sp,#64]
49
stp x27,x28,[sp,#80]
50
// 64 bytes of scratch below the frame; used only by the partial-block tail.
sub sp,sp,#64
51
52
// x22..x28 and x30 hold the 16-word input block, two 32-bit words per
// register (low word in bits 0-31). They stay constant across blocks except
// for the counter increment applied to x28.
ldp x22,x23,[x5] // load sigma
53
ldp x24,x25,[x3] // load key
54
ldp x26,x27,[x3,#16]
55
ldp x28,x30,[x4] // load counter
56
#ifdef __AARCH64EB__
57
// Big-endian build: swap the 32-bit halves of the key and counter registers
// (sigma is left as loaded, see the "endian-neutral" note on its data).
ror x24,x24,#32
58
ror x25,x25,#32
59
ror x26,x26,#32
60
ror x27,x27,#32
61
ror x28,x28,#32
62
ror x30,x30,#32
63
#endif
64
65
// One pass per 64-byte block.
.Loop_outer:
66
// Working state: words 0-3 in w5-w8, 4-7 in w9-w12, 8-11 in w13-w16,
// 12-15 in w17,w19,w20,w21.
mov w5,w22 // unpack key block
67
lsr x6,x22,#32
68
mov w7,w23
69
lsr x8,x23,#32
70
mov w9,w24
71
lsr x10,x24,#32
72
mov w11,w25
73
lsr x12,x25,#32
74
mov w13,w26
75
lsr x14,x26,#32
76
mov w15,w27
77
lsr x16,x27,#32
78
mov w17,w28
79
lsr x19,x28,#32
80
mov w20,w30
81
lsr x21,x30,#32
82
83
// Ten iterations of a column step plus a diagonal step.
mov x4,#10
84
// The flags set here are consumed by b.lo .Ltail and b.hi .Loop_outer
// further down; no instruction in between writes NZCV.
subs x2,x2,#64
85
.Loop:
86
sub x4,x4,#1
87
// Column step: four quarter-rounds on (0,4,8,12) (1,5,9,13) (2,6,10,14)
// (3,7,11,15), interleaved. ror by 16/20/24/25 is a left rotate by 16/12/8/7.
add w5,w5,w9
88
add w6,w6,w10
89
add w7,w7,w11
90
add w8,w8,w12
91
eor w17,w17,w5
92
eor w19,w19,w6
93
eor w20,w20,w7
94
eor w21,w21,w8
95
ror w17,w17,#16
96
ror w19,w19,#16
97
ror w20,w20,#16
98
ror w21,w21,#16
99
add w13,w13,w17
100
add w14,w14,w19
101
add w15,w15,w20
102
add w16,w16,w21
103
eor w9,w9,w13
104
eor w10,w10,w14
105
eor w11,w11,w15
106
eor w12,w12,w16
107
ror w9,w9,#20
108
ror w10,w10,#20
109
ror w11,w11,#20
110
ror w12,w12,#20
111
add w5,w5,w9
112
add w6,w6,w10
113
add w7,w7,w11
114
add w8,w8,w12
115
eor w17,w17,w5
116
eor w19,w19,w6
117
eor w20,w20,w7
118
eor w21,w21,w8
119
ror w17,w17,#24
120
ror w19,w19,#24
121
ror w20,w20,#24
122
ror w21,w21,#24
123
add w13,w13,w17
124
add w14,w14,w19
125
add w15,w15,w20
126
add w16,w16,w21
127
eor w9,w9,w13
128
eor w10,w10,w14
129
eor w11,w11,w15
130
eor w12,w12,w16
131
ror w9,w9,#25
132
ror w10,w10,#25
133
ror w11,w11,#25
134
ror w12,w12,#25
135
// Diagonal step: quarter-rounds on (0,5,10,15) (1,6,11,12) (2,7,8,13)
// (3,4,9,14), selected by register choice rather than by moving data.
add w5,w5,w10
136
add w6,w6,w11
137
add w7,w7,w12
138
add w8,w8,w9
139
eor w21,w21,w5
140
eor w17,w17,w6
141
eor w19,w19,w7
142
eor w20,w20,w8
143
ror w21,w21,#16
144
ror w17,w17,#16
145
ror w19,w19,#16
146
ror w20,w20,#16
147
add w15,w15,w21
148
add w16,w16,w17
149
add w13,w13,w19
150
add w14,w14,w20
151
eor w10,w10,w15
152
eor w11,w11,w16
153
eor w12,w12,w13
154
eor w9,w9,w14
155
ror w10,w10,#20
156
ror w11,w11,#20
157
ror w12,w12,#20
158
ror w9,w9,#20
159
add w5,w5,w10
160
add w6,w6,w11
161
add w7,w7,w12
162
add w8,w8,w9
163
eor w21,w21,w5
164
eor w17,w17,w6
165
eor w19,w19,w7
166
eor w20,w20,w8
167
ror w21,w21,#24
168
ror w17,w17,#24
169
ror w19,w19,#24
170
ror w20,w20,#24
171
add w15,w15,w21
172
add w16,w16,w17
173
add w13,w13,w19
174
add w14,w14,w20
175
eor w10,w10,w15
176
eor w11,w11,w16
177
eor w12,w12,w13
178
eor w9,w9,w14
179
ror w10,w10,#25
180
ror w11,w11,#25
181
ror w12,w12,#25
182
ror w9,w9,#25
183
cbnz x4,.Loop
184
185
// Add the input block back in. Odd-numbered words use a 64-bit add of the
// high half; any carry above bit 31 is discarded by the lsl #32 in the pack
// step that follows.
add w5,w5,w22 // accumulate key block
186
add x6,x6,x22,lsr#32
187
add w7,w7,w23
188
add x8,x8,x23,lsr#32
189
add w9,w9,w24
190
add x10,x10,x24,lsr#32
191
add w11,w11,w25
192
add x12,x12,x25,lsr#32
193
add w13,w13,w26
194
add x14,x14,x26,lsr#32
195
add w15,w15,w27
196
add x16,x16,x27,lsr#32
197
add w17,w17,w28
198
add x19,x19,x28,lsr#32
199
add w20,w20,w30
200
add x21,x21,x30,lsr#32
201
202
// Fewer than 64 bytes were left before the subs above: partial block.
b.lo .Ltail
203
204
// Full block: pack word pairs into 64-bit registers while loading 64 input
// bytes into the registers that have just been consumed.
add x5,x5,x6,lsl#32 // pack
205
add x7,x7,x8,lsl#32
206
ldp x6,x8,[x1,#0] // load input
207
add x9,x9,x10,lsl#32
208
add x11,x11,x12,lsl#32
209
ldp x10,x12,[x1,#16]
210
add x13,x13,x14,lsl#32
211
add x15,x15,x16,lsl#32
212
ldp x14,x16,[x1,#32]
213
add x17,x17,x19,lsl#32
214
add x20,x20,x21,lsl#32
215
ldp x19,x21,[x1,#48]
216
add x1,x1,#64
217
#ifdef __AARCH64EB__
218
// Big-endian build: byte-reverse the keystream before it is XORed with data.
rev x5,x5
219
rev x7,x7
220
rev x9,x9
221
rev x11,x11
222
rev x13,x13
223
rev x15,x15
224
rev x17,x17
225
rev x20,x20
226
#endif
227
eor x5,x5,x6
228
eor x7,x7,x8
229
eor x9,x9,x10
230
eor x11,x11,x12
231
eor x13,x13,x14
232
eor x15,x15,x16
233
eor x17,x17,x19
234
eor x20,x20,x21
235
236
stp x5,x7,[x0,#0] // store output
237
add x28,x28,#1 // increment counter
238
stp x9,x11,[x0,#16]
239
stp x13,x15,[x0,#32]
240
stp x17,x20,[x0,#48]
241
add x0,x0,#64
242
243
// Still using the flags from subs x2,x2,#64: loop while bytes remain.
b.hi .Loop_outer
244
245
// Epilogue: callee-saved registers are reloaded relative to the frame
// pointer, so the sp adjustment can be interleaved.
ldp x19,x20,[x29,#16]
246
add sp,sp,#64
247
ldp x21,x22,[x29,#32]
248
ldp x23,x24,[x29,#48]
249
ldp x25,x26,[x29,#64]
250
ldp x27,x28,[x29,#80]
251
ldp x29,x30,[sp],#96
252
// ChaCha20_ctr32 branches here for a zero-length request; no frame is live
// at that point, so only the link-register validation and ret are shared.
.Labort:
253
AARCH64_VALIDATE_LINK_REGISTER
254
ret
255
256
.align 4
257
.Ltail:
258
// Undo the subs so x2 is again the number of bytes left (less than 64).
add x2,x2,#64
259
// Also entered from .Ltail_neon in ChaCha20_neon, with the unpacked scalar
// block in w5..w21 and the remaining length in x2.
.Less_than_64:
260
// Point x1, x0 and x4 just past the data (x0 one byte short, because the
// store below happens after the index is incremented) and run a negative
// index in x2 up to zero.
sub x0,x0,#1
261
add x1,x1,x2
262
add x0,x0,x2
263
add x4,sp,x2
264
neg x2,x2
265
266
add x5,x5,x6,lsl#32 // pack
267
add x7,x7,x8,lsl#32
268
add x9,x9,x10,lsl#32
269
add x11,x11,x12,lsl#32
270
add x13,x13,x14,lsl#32
271
add x15,x15,x16,lsl#32
272
add x17,x17,x19,lsl#32
273
add x20,x20,x21,lsl#32
274
#ifdef __AARCH64EB__
275
rev x5,x5
276
rev x7,x7
277
rev x9,x9
278
rev x11,x11
279
rev x13,x13
280
rev x15,x15
281
rev x17,x17
282
rev x20,x20
283
#endif
284
// Spill the whole keystream block to the stack scratch area so it can be
// consumed one byte at a time.
stp x5,x7,[sp,#0]
285
stp x9,x11,[sp,#16]
286
stp x13,x15,[sp,#32]
287
stp x17,x20,[sp,#48]
288
289
.Loop_tail:
290
ldrb w10,[x1,x2]
291
ldrb w11,[x4,x2]
292
add x2,x2,#1
293
eor w10,w10,w11
294
strb w10,[x0,x2]
295
cbnz x2,.Loop_tail
296
297
// Overwrite the keystream copy on the stack with zeros before returning.
stp xzr,xzr,[sp,#0]
298
stp xzr,xzr,[sp,#16]
299
stp xzr,xzr,[sp,#32]
300
stp xzr,xzr,[sp,#48]
301
302
ldp x19,x20,[x29,#16]
303
add sp,sp,#64
304
ldp x21,x22,[x29,#32]
305
ldp x23,x24,[x29,#48]
306
ldp x25,x26,[x29,#64]
307
ldp x27,x28,[x29,#80]
308
ldp x29,x30,[sp],#96
309
AARCH64_VALIDATE_LINK_REGISTER
310
ret
311
.size ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
312
313
// ChaCha20_ctr32: public entry point; selects an implementation by length
// and CPU capability.
// Register use on entry matches ChaCha20_ctr32_dflt (x0 = output, x1 = input,
// x2 = length in bytes, x3 = key pointer, x4 = counter pointer); the
// registers are passed through unchanged to the code that is branched to.
//   x2 == 0            -> return immediately through .Labort
//   x2 <  192          -> scalar code at .Lshort
//   otherwise, when __KERNEL__ is not defined:
//     ARMV8_SVE clear  -> .Lcheck_neon (NEON if available, else scalar)
//     ARMV8_SVE set    -> ChaCha20_ctr32_sve, then ChaCha20_ctr32_dflt for
//                         whatever length is still left in x2
//   otherwise (__KERNEL__ defined) -> scalar code at .Lshort
// .Labort, .Lshort and .Lcheck_neon are labels inside ChaCha20_ctr32_dflt.
.globl ChaCha20_ctr32
314
.type ChaCha20_ctr32,%function
315
.align 5
316
ChaCha20_ctr32:
317
AARCH64_SIGN_LINK_REGISTER
318
cbz x2,.Labort
319
cmp x2,#192
320
b.lo .Lshort
321
#ifndef __KERNEL__
322
adrp x17,OPENSSL_armcap_P
323
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
324
tst w17,#ARMV8_SVE
325
// .Lcheck_neon expects the capability word in w17, which is still live here.
b.eq .Lcheck_neon
326
// Save the frame pointer and return address, because two calls follow, then
// reserve 16 bytes for a private copy of the counter block.
stp x29,x30,[sp,#-16]!
327
sub sp,sp,#16
328
// SVE handling will inevitably increment the counter
329
// Neon/Scalar code that follows to process tail data needs to
330
// use new counter, unfortunately the input counter buffer
331
// pointed to by ctr is meant to be read-only per API contract
332
// we have to copy the buffer to stack to be writable by SVE
333
ldp x5,x6,[x4]
334
stp x5,x6,[sp]
335
mov x4,sp
336
// NOTE(review): ChaCha20_ctr32_sve is defined outside this view. The code
// below relies on it leaving the unprocessed length in x2 and, presumably,
// updated pointers in x0/x1 plus an advanced counter in the stack copy -
// confirm against its definition.
bl ChaCha20_ctr32_sve
337
// Nothing left: skip the second call (ChaCha20_ctr32_dflt itself has no
// zero-length check).
cbz x2,1f
338
bl ChaCha20_ctr32_dflt
339
1:
340
add sp,sp,#16
341
ldp x29,x30,[sp],#16
342
AARCH64_VALIDATE_LINK_REGISTER
343
ret
344
#endif
345
// Reached only when the block above is compiled out (__KERNEL__ defined).
b .Lshort
346
.size ChaCha20_ctr32,.-ChaCha20_ctr32
347
348
// ChaCha20_neon: ChaCha20 XOR of a buffer, five 64-byte blocks (320 bytes)
// per outer iteration - one block in general-purpose registers and four
// blocks in NEON registers, with the two instruction streams interleaved.
// Register use on entry matches ChaCha20_ctr32_dflt (x0 = output, x1 = input,
// x2 = length in bytes, x3 = key pointer, x4 = counter pointer).
// The symbol is exported only when __KERNEL__ is defined; otherwise it is
// reached through the local label .LChaCha20_neon from ChaCha20_ctr32_dflt.
// Lengths of 512 bytes or more are handed to .L512_or_more_neon, a label in
// ChaCha20_512_neon that follows an identical 96-byte frame setup.
// x19-x28, x29, x30 and d8/d9 are saved and restored around the work.
#ifdef __KERNEL__
349
.globl ChaCha20_neon
350
#endif
351
.type ChaCha20_neon,%function
352
.align 5
353
ChaCha20_neon:
354
AARCH64_SIGN_LINK_REGISTER
355
// Entry from ChaCha20_ctr32_dflt, whose caller-side path has already
// executed AARCH64_SIGN_LINK_REGISTER.
.LChaCha20_neon:
356
stp x29,x30,[sp,#-96]!
357
add x29,sp,#0
358
359
adrp x5,.Lsigma
360
add x5,x5,#:lo12:.Lsigma
361
stp x19,x20,[sp,#16]
362
stp x21,x22,[sp,#32]
363
stp x23,x24,[sp,#48]
364
stp x25,x26,[sp,#64]
365
stp x27,x28,[sp,#80]
366
cmp x2,#512
367
b.hs .L512_or_more_neon
368
369
// 64 bytes of scratch: holds d8/d9 while the loop runs, and later the last
// keystream block for a partial tail.
sub sp,sp,#64
370
371
// The input block is kept twice: packed in x22..x28,x30 for the scalar
// stream and as rows v0 (sigma), v1/v2 (key), v3 (counter block) for NEON.
ldp x22,x23,[x5] // load sigma
372
// Post-increment leaves x5 pointing at .Lone.
ld1 {v0.4s},[x5],#16
373
ldp x24,x25,[x3] // load key
374
ldp x26,x27,[x3,#16]
375
ld1 {v1.4s,v2.4s},[x3]
376
ldp x28,x30,[x4] // load counter
377
ld1 {v3.4s},[x4]
378
stp d8,d9,[sp] // meet ABI requirements
379
// v8 = .Lone (lane offsets 1,2,3,4), v9 = .Lrot24 (tbl indices).
ld1 {v8.4s,v9.4s},[x5]
380
#ifdef __AARCH64EB__
381
rev64 v0.4s,v0.4s
382
ror x24,x24,#32
383
ror x25,x25,#32
384
ror x26,x26,#32
385
ror x27,x27,#32
386
ror x28,x28,#32
387
ror x30,x30,#32
388
#endif
389
390
.Loop_outer_neon:
391
// NEON working state is held word-sliced: each vector carries one state word
// for four blocks, one block per lane. Words 0-3 are v16,v20,v24,v28;
// 4-7 are v17,v21,v25,v29; 8-11 are v18,v22,v26,v30; 12-15 are
// v19,v23,v27,v31. The scalar block uses w5..w21 as in ChaCha20_ctr32_dflt.
dup v16.4s,v0.s[0] // unpack key block
392
mov w5,w22
393
dup v20.4s,v0.s[1]
394
lsr x6,x22,#32
395
dup v24.4s,v0.s[2]
396
mov w7,w23
397
dup v28.4s,v0.s[3]
398
lsr x8,x23,#32
399
dup v17.4s,v1.s[0]
400
mov w9,w24
401
dup v21.4s,v1.s[1]
402
lsr x10,x24,#32
403
dup v25.4s,v1.s[2]
404
mov w11,w25
405
dup v29.4s,v1.s[3]
406
lsr x12,x25,#32
407
dup v19.4s,v3.s[0]
408
mov w13,w26
409
dup v23.4s,v3.s[1]
410
lsr x14,x26,#32
411
dup v27.4s,v3.s[2]
412
mov w15,w27
413
dup v31.4s,v3.s[3]
414
lsr x16,x27,#32
415
// Give each NEON lane its own counter: the scalar block uses the counter in
// w28, and the vector lanes use the replicated v3 counter word plus v8.
add v19.4s,v19.4s,v8.4s
416
mov w17,w28
417
dup v18.4s,v2.s[0]
418
lsr x19,x28,#32
419
dup v22.4s,v2.s[1]
420
mov w20,w30
421
dup v26.4s,v2.s[2]
422
lsr x21,x30,#32
423
dup v30.4s,v2.s[3]
424
425
mov x4,#10
426
// The flags set here are consumed by b.lo .Ltail_neon and
// b.hi .Loop_outer_neon; no instruction in between writes NZCV.
subs x2,x2,#320
427
.Loop_neon:
428
sub x4,x4,#1
429
// Column step on both streams. NEON rotations: rev32 on halfwords rotates
// each word by 16; ushr #20 + sli #12 rotates left by 12; tbl with v9
// rotates left by 8; ushr #25 + sli #7 rotates left by 7. v4-v7 are
// temporaries for the rotate inputs.
add v16.4s,v16.4s,v17.4s
430
add w5,w5,w9
431
add v20.4s,v20.4s,v21.4s
432
add w6,w6,w10
433
add v24.4s,v24.4s,v25.4s
434
add w7,w7,w11
435
add v28.4s,v28.4s,v29.4s
436
add w8,w8,w12
437
eor v19.16b,v19.16b,v16.16b
438
eor w17,w17,w5
439
eor v23.16b,v23.16b,v20.16b
440
eor w19,w19,w6
441
eor v27.16b,v27.16b,v24.16b
442
eor w20,w20,w7
443
eor v31.16b,v31.16b,v28.16b
444
eor w21,w21,w8
445
rev32 v19.8h,v19.8h
446
ror w17,w17,#16
447
rev32 v23.8h,v23.8h
448
ror w19,w19,#16
449
rev32 v27.8h,v27.8h
450
ror w20,w20,#16
451
rev32 v31.8h,v31.8h
452
ror w21,w21,#16
453
add v18.4s,v18.4s,v19.4s
454
add w13,w13,w17
455
add v22.4s,v22.4s,v23.4s
456
add w14,w14,w19
457
add v26.4s,v26.4s,v27.4s
458
add w15,w15,w20
459
add v30.4s,v30.4s,v31.4s
460
add w16,w16,w21
461
eor v4.16b,v17.16b,v18.16b
462
eor w9,w9,w13
463
eor v5.16b,v21.16b,v22.16b
464
eor w10,w10,w14
465
eor v6.16b,v25.16b,v26.16b
466
eor w11,w11,w15
467
eor v7.16b,v29.16b,v30.16b
468
eor w12,w12,w16
469
ushr v17.4s,v4.4s,#20
470
ror w9,w9,#20
471
ushr v21.4s,v5.4s,#20
472
ror w10,w10,#20
473
ushr v25.4s,v6.4s,#20
474
ror w11,w11,#20
475
ushr v29.4s,v7.4s,#20
476
ror w12,w12,#20
477
sli v17.4s,v4.4s,#12
478
add w5,w5,w9
479
sli v21.4s,v5.4s,#12
480
add w6,w6,w10
481
sli v25.4s,v6.4s,#12
482
add w7,w7,w11
483
sli v29.4s,v7.4s,#12
484
add w8,w8,w12
485
add v16.4s,v16.4s,v17.4s
486
eor w17,w17,w5
487
add v20.4s,v20.4s,v21.4s
488
eor w19,w19,w6
489
add v24.4s,v24.4s,v25.4s
490
eor w20,w20,w7
491
add v28.4s,v28.4s,v29.4s
492
eor w21,w21,w8
493
eor v4.16b,v19.16b,v16.16b
494
ror w17,w17,#24
495
eor v5.16b,v23.16b,v20.16b
496
ror w19,w19,#24
497
eor v6.16b,v27.16b,v24.16b
498
ror w20,w20,#24
499
eor v7.16b,v31.16b,v28.16b
500
ror w21,w21,#24
501
tbl v19.16b,{v4.16b},v9.16b
502
add w13,w13,w17
503
tbl v23.16b,{v5.16b},v9.16b
504
add w14,w14,w19
505
tbl v27.16b,{v6.16b},v9.16b
506
add w15,w15,w20
507
tbl v31.16b,{v7.16b},v9.16b
508
add w16,w16,w21
509
add v18.4s,v18.4s,v19.4s
510
eor w9,w9,w13
511
add v22.4s,v22.4s,v23.4s
512
eor w10,w10,w14
513
add v26.4s,v26.4s,v27.4s
514
eor w11,w11,w15
515
add v30.4s,v30.4s,v31.4s
516
eor w12,w12,w16
517
eor v4.16b,v17.16b,v18.16b
518
ror w9,w9,#25
519
eor v5.16b,v21.16b,v22.16b
520
ror w10,w10,#25
521
eor v6.16b,v25.16b,v26.16b
522
ror w11,w11,#25
523
eor v7.16b,v29.16b,v30.16b
524
ror w12,w12,#25
525
ushr v17.4s,v4.4s,#25
526
ushr v21.4s,v5.4s,#25
527
ushr v25.4s,v6.4s,#25
528
ushr v29.4s,v7.4s,#25
529
sli v17.4s,v4.4s,#7
530
sli v21.4s,v5.4s,#7
531
sli v25.4s,v6.4s,#7
532
sli v29.4s,v7.4s,#7
533
// Diagonal step: the same operations with the operands permuted (word 0
// with 5,10,15 and so on) instead of moving data between registers.
add v16.4s,v16.4s,v21.4s
534
add w5,w5,w10
535
add v20.4s,v20.4s,v25.4s
536
add w6,w6,w11
537
add v24.4s,v24.4s,v29.4s
538
add w7,w7,w12
539
add v28.4s,v28.4s,v17.4s
540
add w8,w8,w9
541
eor v31.16b,v31.16b,v16.16b
542
eor w21,w21,w5
543
eor v19.16b,v19.16b,v20.16b
544
eor w17,w17,w6
545
eor v23.16b,v23.16b,v24.16b
546
eor w19,w19,w7
547
eor v27.16b,v27.16b,v28.16b
548
eor w20,w20,w8
549
rev32 v31.8h,v31.8h
550
ror w21,w21,#16
551
rev32 v19.8h,v19.8h
552
ror w17,w17,#16
553
rev32 v23.8h,v23.8h
554
ror w19,w19,#16
555
rev32 v27.8h,v27.8h
556
ror w20,w20,#16
557
add v26.4s,v26.4s,v31.4s
558
add w15,w15,w21
559
add v30.4s,v30.4s,v19.4s
560
add w16,w16,w17
561
add v18.4s,v18.4s,v23.4s
562
add w13,w13,w19
563
add v22.4s,v22.4s,v27.4s
564
add w14,w14,w20
565
eor v4.16b,v21.16b,v26.16b
566
eor w10,w10,w15
567
eor v5.16b,v25.16b,v30.16b
568
eor w11,w11,w16
569
eor v6.16b,v29.16b,v18.16b
570
eor w12,w12,w13
571
eor v7.16b,v17.16b,v22.16b
572
eor w9,w9,w14
573
ushr v21.4s,v4.4s,#20
574
ror w10,w10,#20
575
ushr v25.4s,v5.4s,#20
576
ror w11,w11,#20
577
ushr v29.4s,v6.4s,#20
578
ror w12,w12,#20
579
ushr v17.4s,v7.4s,#20
580
ror w9,w9,#20
581
sli v21.4s,v4.4s,#12
582
add w5,w5,w10
583
sli v25.4s,v5.4s,#12
584
add w6,w6,w11
585
sli v29.4s,v6.4s,#12
586
add w7,w7,w12
587
sli v17.4s,v7.4s,#12
588
add w8,w8,w9
589
add v16.4s,v16.4s,v21.4s
590
eor w21,w21,w5
591
add v20.4s,v20.4s,v25.4s
592
eor w17,w17,w6
593
add v24.4s,v24.4s,v29.4s
594
eor w19,w19,w7
595
add v28.4s,v28.4s,v17.4s
596
eor w20,w20,w8
597
eor v4.16b,v31.16b,v16.16b
598
ror w21,w21,#24
599
eor v5.16b,v19.16b,v20.16b
600
ror w17,w17,#24
601
eor v6.16b,v23.16b,v24.16b
602
ror w19,w19,#24
603
eor v7.16b,v27.16b,v28.16b
604
ror w20,w20,#24
605
tbl v31.16b,{v4.16b},v9.16b
606
add w15,w15,w21
607
tbl v19.16b,{v5.16b},v9.16b
608
add w16,w16,w17
609
tbl v23.16b,{v6.16b},v9.16b
610
add w13,w13,w19
611
tbl v27.16b,{v7.16b},v9.16b
612
add w14,w14,w20
613
add v26.4s,v26.4s,v31.4s
614
eor w10,w10,w15
615
add v30.4s,v30.4s,v19.4s
616
eor w11,w11,w16
617
add v18.4s,v18.4s,v23.4s
618
eor w12,w12,w13
619
add v22.4s,v22.4s,v27.4s
620
eor w9,w9,w14
621
eor v4.16b,v21.16b,v26.16b
622
ror w10,w10,#25
623
eor v5.16b,v25.16b,v30.16b
624
ror w11,w11,#25
625
eor v6.16b,v29.16b,v18.16b
626
ror w12,w12,#25
627
eor v7.16b,v17.16b,v22.16b
628
ror w9,w9,#25
629
ushr v21.4s,v4.4s,#25
630
ushr v25.4s,v5.4s,#25
631
ushr v29.4s,v6.4s,#25
632
ushr v17.4s,v7.4s,#25
633
sli v21.4s,v4.4s,#7
634
sli v25.4s,v5.4s,#7
635
sli v29.4s,v6.4s,#7
636
sli v17.4s,v7.4s,#7
637
cbnz x4,.Loop_neon
638
639
// Fold the per-lane counter offsets into word 12 while it is still
// word-sliced; the base counter is added with v3 after the transpose.
add v19.4s,v19.4s,v8.4s
640
641
// 4x4 transposes (zip on words, then on doublewords) turn the word-sliced
// layout into whole blocks: v16-v19, v20-v23, v24-v27 and v28-v31 each
// become the four 16-byte rows of one block.
zip1 v4.4s,v16.4s,v20.4s // transpose data
642
zip1 v5.4s,v24.4s,v28.4s
643
zip2 v6.4s,v16.4s,v20.4s
644
zip2 v7.4s,v24.4s,v28.4s
645
zip1 v16.2d,v4.2d,v5.2d
646
zip2 v20.2d,v4.2d,v5.2d
647
zip1 v24.2d,v6.2d,v7.2d
648
zip2 v28.2d,v6.2d,v7.2d
649
650
zip1 v4.4s,v17.4s,v21.4s
651
zip1 v5.4s,v25.4s,v29.4s
652
zip2 v6.4s,v17.4s,v21.4s
653
zip2 v7.4s,v25.4s,v29.4s
654
zip1 v17.2d,v4.2d,v5.2d
655
zip2 v21.2d,v4.2d,v5.2d
656
zip1 v25.2d,v6.2d,v7.2d
657
zip2 v29.2d,v6.2d,v7.2d
658
659
zip1 v4.4s,v18.4s,v22.4s
660
add w5,w5,w22 // accumulate key block
661
zip1 v5.4s,v26.4s,v30.4s
662
add x6,x6,x22,lsr#32
663
zip2 v6.4s,v18.4s,v22.4s
664
add w7,w7,w23
665
zip2 v7.4s,v26.4s,v30.4s
666
add x8,x8,x23,lsr#32
667
zip1 v18.2d,v4.2d,v5.2d
668
add w9,w9,w24
669
zip2 v22.2d,v4.2d,v5.2d
670
add x10,x10,x24,lsr#32
671
zip1 v26.2d,v6.2d,v7.2d
672
add w11,w11,w25
673
zip2 v30.2d,v6.2d,v7.2d
674
add x12,x12,x25,lsr#32
675
676
zip1 v4.4s,v19.4s,v23.4s
677
add w13,w13,w26
678
zip1 v5.4s,v27.4s,v31.4s
679
add x14,x14,x26,lsr#32
680
zip2 v6.4s,v19.4s,v23.4s
681
add w15,w15,w27
682
zip2 v7.4s,v27.4s,v31.4s
683
add x16,x16,x27,lsr#32
684
zip1 v19.2d,v4.2d,v5.2d
685
add w17,w17,w28
686
zip2 v23.2d,v4.2d,v5.2d
687
add x19,x19,x28,lsr#32
688
zip1 v27.2d,v6.2d,v7.2d
689
add w20,w20,w30
690
zip2 v31.2d,v6.2d,v7.2d
691
add x21,x21,x30,lsr#32
692
693
// Fewer than 320 bytes were left before the subs above.
b.lo .Ltail_neon
694
695
// Full 320-byte chunk: the scalar block covers the first 64 bytes and the
// four NEON blocks cover the next 256. Loads, key-block adds, XORs and
// stores are interleaved.
add x5,x5,x6,lsl#32 // pack
696
add x7,x7,x8,lsl#32
697
ldp x6,x8,[x1,#0] // load input
698
add v16.4s,v16.4s,v0.4s // accumulate key block
699
add x9,x9,x10,lsl#32
700
add x11,x11,x12,lsl#32
701
ldp x10,x12,[x1,#16]
702
add v17.4s,v17.4s,v1.4s
703
add x13,x13,x14,lsl#32
704
add x15,x15,x16,lsl#32
705
ldp x14,x16,[x1,#32]
706
add v18.4s,v18.4s,v2.4s
707
add x17,x17,x19,lsl#32
708
add x20,x20,x21,lsl#32
709
ldp x19,x21,[x1,#48]
710
add v19.4s,v19.4s,v3.4s
711
add x1,x1,#64
712
#ifdef __AARCH64EB__
713
rev x5,x5
714
rev x7,x7
715
rev x9,x9
716
rev x11,x11
717
rev x13,x13
718
rev x15,x15
719
rev x17,x17
720
rev x20,x20
721
#endif
722
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
723
eor x5,x5,x6
724
add v20.4s,v20.4s,v0.4s
725
eor x7,x7,x8
726
add v21.4s,v21.4s,v1.4s
727
eor x9,x9,x10
728
add v22.4s,v22.4s,v2.4s
729
eor x11,x11,x12
730
add v23.4s,v23.4s,v3.4s
731
eor x13,x13,x14
732
eor v16.16b,v16.16b,v4.16b
733
// v4 is free again after the eor above; reuse it for the constant 5.
movi v4.4s,#5
734
eor x15,x15,x16
735
eor v17.16b,v17.16b,v5.16b
736
eor x17,x17,x19
737
eor v18.16b,v18.16b,v6.16b
738
eor x20,x20,x21
739
eor v19.16b,v19.16b,v7.16b
740
// Advance the NEON lane offsets by the five blocks just produced; v3 itself
// keeps the original counter.
add v8.4s,v8.4s,v4.4s // += 5
741
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
742
743
stp x5,x7,[x0,#0] // store output
744
add x28,x28,#5 // increment counter
745
stp x9,x11,[x0,#16]
746
stp x13,x15,[x0,#32]
747
stp x17,x20,[x0,#48]
748
add x0,x0,#64
749
750
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
751
add v24.4s,v24.4s,v0.4s
752
add v25.4s,v25.4s,v1.4s
753
add v26.4s,v26.4s,v2.4s
754
add v27.4s,v27.4s,v3.4s
755
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
756
757
eor v20.16b,v20.16b,v4.16b
758
eor v21.16b,v21.16b,v5.16b
759
eor v22.16b,v22.16b,v6.16b
760
eor v23.16b,v23.16b,v7.16b
761
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
762
add v28.4s,v28.4s,v0.4s
763
add v29.4s,v29.4s,v1.4s
764
add v30.4s,v30.4s,v2.4s
765
add v31.4s,v31.4s,v3.4s
766
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
767
768
eor v24.16b,v24.16b,v16.16b
769
eor v25.16b,v25.16b,v17.16b
770
eor v26.16b,v26.16b,v18.16b
771
eor v27.16b,v27.16b,v19.16b
772
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
773
774
eor v28.16b,v28.16b,v20.16b
775
eor v29.16b,v29.16b,v21.16b
776
eor v30.16b,v30.16b,v22.16b
777
eor v31.16b,v31.16b,v23.16b
778
st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
779
780
// Still using the flags from subs x2,x2,#320: loop while bytes remain.
b.hi .Loop_outer_neon
781
782
ldp d8,d9,[sp] // meet ABI requirements
783
784
ldp x19,x20,[x29,#16]
785
add sp,sp,#64
786
ldp x21,x22,[x29,#32]
787
ldp x23,x24,[x29,#48]
788
ldp x25,x26,[x29,#64]
789
ldp x27,x28,[x29,#80]
790
ldp x29,x30,[sp],#96
791
AARCH64_VALIDATE_LINK_REGISTER
792
ret
793
794
.align 4
795
// Fewer than 320 bytes remain. Keystream for five blocks has already been
// computed; consume it 64 bytes at a time, scalar block first.
.Ltail_neon:
796
add x2,x2,#320
797
// v8/v9 are no longer needed. Restore them now, because .Last_neon reuses
// the same stack slot for keystream.
ldp d8,d9,[sp] // meet ABI requirements
798
cmp x2,#64
799
// Under 64 bytes: share the scalar tail in ChaCha20_ctr32_dflt.
b.lo .Less_than_64
800
801
add x5,x5,x6,lsl#32 // pack
802
add x7,x7,x8,lsl#32
803
ldp x6,x8,[x1,#0] // load input
804
add x9,x9,x10,lsl#32
805
add x11,x11,x12,lsl#32
806
ldp x10,x12,[x1,#16]
807
add x13,x13,x14,lsl#32
808
add x15,x15,x16,lsl#32
809
ldp x14,x16,[x1,#32]
810
add x17,x17,x19,lsl#32
811
add x20,x20,x21,lsl#32
812
ldp x19,x21,[x1,#48]
813
add x1,x1,#64
814
#ifdef __AARCH64EB__
815
rev x5,x5
816
rev x7,x7
817
rev x9,x9
818
rev x11,x11
819
rev x13,x13
820
rev x15,x15
821
rev x17,x17
822
rev x20,x20
823
#endif
824
eor x5,x5,x6
825
eor x7,x7,x8
826
eor x9,x9,x10
827
eor x11,x11,x12
828
eor x13,x13,x14
829
eor x15,x15,x16
830
eor x17,x17,x19
831
eor x20,x20,x21
832
833
stp x5,x7,[x0,#0] // store output
834
add v16.4s,v16.4s,v0.4s // accumulate key block
835
stp x9,x11,[x0,#16]
836
add v17.4s,v17.4s,v1.4s
837
stp x13,x15,[x0,#32]
838
add v18.4s,v18.4s,v2.4s
839
stp x17,x20,[x0,#48]
840
add v19.4s,v19.4s,v3.4s
841
add x0,x0,#64
842
// Flags are still those of cmp x2,#64 above: exactly 64 bytes means done.
b.eq .Ldone_neon
843
// From here on v16-v19 always hold the next unused keystream block, so that
// .Last_neon can spill it without knowing which block it is.
sub x2,x2,#64
844
cmp x2,#64
845
b.lo .Last_neon
846
847
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
848
eor v16.16b,v16.16b,v4.16b
849
eor v17.16b,v17.16b,v5.16b
850
eor v18.16b,v18.16b,v6.16b
851
eor v19.16b,v19.16b,v7.16b
852
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
853
b.eq .Ldone_neon
854
855
add v16.4s,v20.4s,v0.4s
856
add v17.4s,v21.4s,v1.4s
857
sub x2,x2,#64
858
add v18.4s,v22.4s,v2.4s
859
cmp x2,#64
860
add v19.4s,v23.4s,v3.4s
861
b.lo .Last_neon
862
863
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
864
eor v20.16b,v16.16b,v4.16b
865
eor v21.16b,v17.16b,v5.16b
866
eor v22.16b,v18.16b,v6.16b
867
eor v23.16b,v19.16b,v7.16b
868
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
869
b.eq .Ldone_neon
870
871
add v16.4s,v24.4s,v0.4s
872
add v17.4s,v25.4s,v1.4s
873
sub x2,x2,#64
874
add v18.4s,v26.4s,v2.4s
875
cmp x2,#64
876
add v19.4s,v27.4s,v3.4s
877
b.lo .Last_neon
878
879
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
880
eor v24.16b,v16.16b,v4.16b
881
eor v25.16b,v17.16b,v5.16b
882
eor v26.16b,v18.16b,v6.16b
883
eor v27.16b,v19.16b,v7.16b
884
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
885
b.eq .Ldone_neon
886
887
add v16.4s,v28.4s,v0.4s
888
add v17.4s,v29.4s,v1.4s
889
add v18.4s,v30.4s,v2.4s
890
add v19.4s,v31.4s,v3.4s
891
sub x2,x2,#64
892
893
// Partial final block: spill the pending keystream block to the stack and
// XOR the remaining x2 bytes one at a time, with the same negative-index
// scheme as .Less_than_64.
.Last_neon:
894
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
895
896
sub x0,x0,#1
897
add x1,x1,x2
898
add x0,x0,x2
899
add x4,sp,x2
900
neg x2,x2
901
902
.Loop_tail_neon:
903
ldrb w10,[x1,x2]
904
ldrb w11,[x4,x2]
905
add x2,x2,#1
906
eor w10,w10,w11
907
strb w10,[x0,x2]
908
cbnz x2,.Loop_tail_neon
909
910
// Overwrite the keystream copy on the stack with zeros before returning.
stp xzr,xzr,[sp,#0]
911
stp xzr,xzr,[sp,#16]
912
stp xzr,xzr,[sp,#32]
913
stp xzr,xzr,[sp,#48]
914
915
.Ldone_neon:
916
ldp x19,x20,[x29,#16]
917
add sp,sp,#64
918
ldp x21,x22,[x29,#32]
919
ldp x23,x24,[x29,#48]
920
ldp x25,x26,[x29,#64]
921
ldp x27,x28,[x29,#80]
922
ldp x29,x30,[sp],#96
923
AARCH64_VALIDATE_LINK_REGISTER
924
ret
925
.size ChaCha20_neon,.-ChaCha20_neon
926
.type ChaCha20_512_neon,%function
927
.align 5
928
ChaCha20_512_neon:
929
AARCH64_SIGN_LINK_REGISTER
930
stp x29,x30,[sp,#-96]!
931
add x29,sp,#0
932
933
adrp x5,.Lsigma
934
add x5,x5,#:lo12:.Lsigma
935
stp x19,x20,[sp,#16]
936
stp x21,x22,[sp,#32]
937
stp x23,x24,[sp,#48]
938
stp x25,x26,[sp,#64]
939
stp x27,x28,[sp,#80]
940
941
.L512_or_more_neon:
942
sub sp,sp,#128+64
943
944
eor v7.16b,v7.16b,v7.16b
945
ldp x22,x23,[x5] // load sigma
946
ld1 {v0.4s},[x5],#16
947
ldp x24,x25,[x3] // load key
948
ldp x26,x27,[x3,#16]
949
ld1 {v1.4s,v2.4s},[x3]
950
ldp x28,x30,[x4] // load counter
951
ld1 {v3.4s},[x4]
952
ld1 {v7.s}[0],[x5]
953
add x3,x5,#16 // .Lrot24
954
#ifdef __AARCH64EB__
955
rev64 v0.4s,v0.4s
956
ror x24,x24,#32
957
ror x25,x25,#32
958
ror x26,x26,#32
959
ror x27,x27,#32
960
ror x28,x28,#32
961
ror x30,x30,#32
962
#endif
963
add v3.4s,v3.4s,v7.4s // += 1
964
stp q0,q1,[sp,#0] // off-load key block, invariant part
965
add v3.4s,v3.4s,v7.4s // not typo
966
str q2,[sp,#32]
967
add v4.4s,v3.4s,v7.4s
968
add v5.4s,v4.4s,v7.4s
969
add v6.4s,v5.4s,v7.4s
970
shl v7.4s,v7.4s,#2 // 1 -> 4
971
972
stp d8,d9,[sp,#128+0] // meet ABI requirements
973
stp d10,d11,[sp,#128+16]
974
stp d12,d13,[sp,#128+32]
975
stp d14,d15,[sp,#128+48]
976
977
sub x2,x2,#512 // not typo
978
979
.Loop_outer_512_neon:
980
mov v8.16b,v0.16b
981
mov v12.16b,v0.16b
982
mov v16.16b,v0.16b
983
mov v20.16b,v0.16b
984
mov v24.16b,v0.16b
985
mov v28.16b,v0.16b
986
mov v9.16b,v1.16b
987
mov w5,w22 // unpack key block
988
mov v13.16b,v1.16b
989
lsr x6,x22,#32
990
mov v17.16b,v1.16b
991
mov w7,w23
992
mov v21.16b,v1.16b
993
lsr x8,x23,#32
994
mov v25.16b,v1.16b
995
mov w9,w24
996
mov v29.16b,v1.16b
997
lsr x10,x24,#32
998
mov v11.16b,v3.16b
999
mov w11,w25
1000
mov v15.16b,v4.16b
1001
lsr x12,x25,#32
1002
mov v19.16b,v5.16b
1003
mov w13,w26
1004
mov v23.16b,v6.16b
1005
lsr x14,x26,#32
1006
mov v10.16b,v2.16b
1007
mov w15,w27
1008
mov v14.16b,v2.16b
1009
lsr x16,x27,#32
1010
add v27.4s,v11.4s,v7.4s // +4
1011
mov w17,w28
1012
add v31.4s,v15.4s,v7.4s // +4
1013
lsr x19,x28,#32
1014
mov v18.16b,v2.16b
1015
mov w20,w30
1016
mov v22.16b,v2.16b
1017
lsr x21,x30,#32
1018
mov v26.16b,v2.16b
1019
stp q3,q4,[sp,#48] // off-load key block, variable part
1020
mov v30.16b,v2.16b
1021
stp q5,q6,[sp,#80]
1022
1023
mov x4,#5
1024
ld1 {v6.4s},[x3]
1025
subs x2,x2,#512
1026
.Loop_upper_neon:
1027
sub x4,x4,#1
1028
add v8.4s,v8.4s,v9.4s
1029
add w5,w5,w9
1030
add v12.4s,v12.4s,v13.4s
1031
add w6,w6,w10
1032
add v16.4s,v16.4s,v17.4s
1033
add w7,w7,w11
1034
add v20.4s,v20.4s,v21.4s
1035
add w8,w8,w12
1036
add v24.4s,v24.4s,v25.4s
1037
eor w17,w17,w5
1038
add v28.4s,v28.4s,v29.4s
1039
eor w19,w19,w6
1040
eor v11.16b,v11.16b,v8.16b
1041
eor w20,w20,w7
1042
eor v15.16b,v15.16b,v12.16b
1043
eor w21,w21,w8
1044
eor v19.16b,v19.16b,v16.16b
1045
ror w17,w17,#16
1046
eor v23.16b,v23.16b,v20.16b
1047
ror w19,w19,#16
1048
eor v27.16b,v27.16b,v24.16b
1049
ror w20,w20,#16
1050
eor v31.16b,v31.16b,v28.16b
1051
ror w21,w21,#16
1052
rev32 v11.8h,v11.8h
1053
add w13,w13,w17
1054
rev32 v15.8h,v15.8h
1055
add w14,w14,w19
1056
rev32 v19.8h,v19.8h
1057
add w15,w15,w20
1058
rev32 v23.8h,v23.8h
1059
add w16,w16,w21
1060
rev32 v27.8h,v27.8h
1061
eor w9,w9,w13
1062
rev32 v31.8h,v31.8h
1063
eor w10,w10,w14
1064
add v10.4s,v10.4s,v11.4s
1065
eor w11,w11,w15
1066
add v14.4s,v14.4s,v15.4s
1067
eor w12,w12,w16
1068
add v18.4s,v18.4s,v19.4s
1069
ror w9,w9,#20
1070
add v22.4s,v22.4s,v23.4s
1071
ror w10,w10,#20
1072
add v26.4s,v26.4s,v27.4s
1073
ror w11,w11,#20
1074
add v30.4s,v30.4s,v31.4s
1075
ror w12,w12,#20
1076
eor v0.16b,v9.16b,v10.16b
1077
add w5,w5,w9
1078
eor v1.16b,v13.16b,v14.16b
1079
add w6,w6,w10
1080
eor v2.16b,v17.16b,v18.16b
1081
add w7,w7,w11
1082
eor v3.16b,v21.16b,v22.16b
1083
add w8,w8,w12
1084
eor v4.16b,v25.16b,v26.16b
1085
eor w17,w17,w5
1086
eor v5.16b,v29.16b,v30.16b
1087
eor w19,w19,w6
1088
ushr v9.4s,v0.4s,#20
1089
eor w20,w20,w7
1090
ushr v13.4s,v1.4s,#20
1091
eor w21,w21,w8
1092
ushr v17.4s,v2.4s,#20
1093
ror w17,w17,#24
1094
ushr v21.4s,v3.4s,#20
1095
ror w19,w19,#24
1096
ushr v25.4s,v4.4s,#20
1097
ror w20,w20,#24
1098
ushr v29.4s,v5.4s,#20
1099
ror w21,w21,#24
1100
sli v9.4s,v0.4s,#12
1101
add w13,w13,w17
1102
sli v13.4s,v1.4s,#12
1103
add w14,w14,w19
1104
sli v17.4s,v2.4s,#12
1105
add w15,w15,w20
1106
sli v21.4s,v3.4s,#12
1107
add w16,w16,w21
1108
sli v25.4s,v4.4s,#12
1109
eor w9,w9,w13
1110
sli v29.4s,v5.4s,#12
1111
eor w10,w10,w14
1112
add v8.4s,v8.4s,v9.4s
1113
eor w11,w11,w15
1114
add v12.4s,v12.4s,v13.4s
1115
eor w12,w12,w16
1116
add v16.4s,v16.4s,v17.4s
1117
ror w9,w9,#25
1118
add v20.4s,v20.4s,v21.4s
1119
ror w10,w10,#25
1120
add v24.4s,v24.4s,v25.4s
1121
ror w11,w11,#25
1122
add v28.4s,v28.4s,v29.4s
1123
ror w12,w12,#25
1124
eor v11.16b,v11.16b,v8.16b
1125
add w5,w5,w10
1126
eor v15.16b,v15.16b,v12.16b
1127
add w6,w6,w11
1128
eor v19.16b,v19.16b,v16.16b
1129
add w7,w7,w12
1130
eor v23.16b,v23.16b,v20.16b
1131
add w8,w8,w9
1132
eor v27.16b,v27.16b,v24.16b
1133
eor w21,w21,w5
1134
eor v31.16b,v31.16b,v28.16b
1135
eor w17,w17,w6
1136
tbl v11.16b,{v11.16b},v6.16b
1137
eor w19,w19,w7
1138
tbl v15.16b,{v15.16b},v6.16b
1139
eor w20,w20,w8
1140
tbl v19.16b,{v19.16b},v6.16b
1141
ror w21,w21,#16
1142
tbl v23.16b,{v23.16b},v6.16b
1143
ror w17,w17,#16
1144
tbl v27.16b,{v27.16b},v6.16b
1145
ror w19,w19,#16
1146
tbl v31.16b,{v31.16b},v6.16b
1147
ror w20,w20,#16
1148
add v10.4s,v10.4s,v11.4s
1149
add w15,w15,w21
1150
add v14.4s,v14.4s,v15.4s
1151
add w16,w16,w17
1152
add v18.4s,v18.4s,v19.4s
1153
add w13,w13,w19
1154
add v22.4s,v22.4s,v23.4s
1155
add w14,w14,w20
1156
add v26.4s,v26.4s,v27.4s
1157
eor w10,w10,w15
1158
add v30.4s,v30.4s,v31.4s
1159
eor w11,w11,w16
1160
eor v0.16b,v9.16b,v10.16b
1161
eor w12,w12,w13
1162
eor v1.16b,v13.16b,v14.16b
1163
eor w9,w9,w14
1164
eor v2.16b,v17.16b,v18.16b
1165
ror w10,w10,#20
1166
eor v3.16b,v21.16b,v22.16b
1167
ror w11,w11,#20
1168
eor v4.16b,v25.16b,v26.16b
1169
ror w12,w12,#20
1170
eor v5.16b,v29.16b,v30.16b
1171
ror w9,w9,#20
1172
ushr v9.4s,v0.4s,#25
1173
add w5,w5,w10
1174
ushr v13.4s,v1.4s,#25
1175
add w6,w6,w11
1176
ushr v17.4s,v2.4s,#25
1177
add w7,w7,w12
1178
ushr v21.4s,v3.4s,#25
1179
add w8,w8,w9
1180
ushr v25.4s,v4.4s,#25
1181
eor w21,w21,w5
1182
ushr v29.4s,v5.4s,#25
1183
eor w17,w17,w6
1184
sli v9.4s,v0.4s,#7
1185
eor w19,w19,w7
1186
sli v13.4s,v1.4s,#7
1187
eor w20,w20,w8
1188
sli v17.4s,v2.4s,#7
1189
ror w21,w21,#24
1190
sli v21.4s,v3.4s,#7
1191
ror w17,w17,#24
1192
sli v25.4s,v4.4s,#7
1193
ror w19,w19,#24
1194
sli v29.4s,v5.4s,#7
1195
ror w20,w20,#24
1196
ext v10.16b,v10.16b,v10.16b,#8
1197
add w15,w15,w21
1198
ext v14.16b,v14.16b,v14.16b,#8
1199
add w16,w16,w17
1200
ext v18.16b,v18.16b,v18.16b,#8
1201
add w13,w13,w19
1202
ext v22.16b,v22.16b,v22.16b,#8
1203
add w14,w14,w20
1204
ext v26.16b,v26.16b,v26.16b,#8
1205
eor w10,w10,w15
1206
ext v30.16b,v30.16b,v30.16b,#8
1207
eor w11,w11,w16
1208
ext v11.16b,v11.16b,v11.16b,#12
1209
eor w12,w12,w13
1210
ext v15.16b,v15.16b,v15.16b,#12
1211
eor w9,w9,w14
1212
ext v19.16b,v19.16b,v19.16b,#12
1213
ror w10,w10,#25
1214
ext v23.16b,v23.16b,v23.16b,#12
1215
ror w11,w11,#25
1216
ext v27.16b,v27.16b,v27.16b,#12
1217
ror w12,w12,#25
1218
ext v31.16b,v31.16b,v31.16b,#12
1219
ror w9,w9,#25
1220
ext v9.16b,v9.16b,v9.16b,#4
1221
ext v13.16b,v13.16b,v13.16b,#4
1222
ext v17.16b,v17.16b,v17.16b,#4
1223
ext v21.16b,v21.16b,v21.16b,#4
1224
ext v25.16b,v25.16b,v25.16b,#4
1225
ext v29.16b,v29.16b,v29.16b,#4
1226
add v8.4s,v8.4s,v9.4s
1227
add w5,w5,w9
1228
add v12.4s,v12.4s,v13.4s
1229
add w6,w6,w10
1230
add v16.4s,v16.4s,v17.4s
1231
add w7,w7,w11
1232
add v20.4s,v20.4s,v21.4s
1233
add w8,w8,w12
1234
add v24.4s,v24.4s,v25.4s
1235
eor w17,w17,w5
1236
add v28.4s,v28.4s,v29.4s
1237
eor w19,w19,w6
1238
eor v11.16b,v11.16b,v8.16b
1239
eor w20,w20,w7
1240
eor v15.16b,v15.16b,v12.16b
1241
eor w21,w21,w8
1242
eor v19.16b,v19.16b,v16.16b
1243
ror w17,w17,#16
1244
eor v23.16b,v23.16b,v20.16b
1245
ror w19,w19,#16
1246
eor v27.16b,v27.16b,v24.16b
1247
ror w20,w20,#16
1248
eor v31.16b,v31.16b,v28.16b
1249
ror w21,w21,#16
1250
rev32 v11.8h,v11.8h
1251
add w13,w13,w17
1252
rev32 v15.8h,v15.8h
1253
add w14,w14,w19
1254
rev32 v19.8h,v19.8h
1255
add w15,w15,w20
1256
rev32 v23.8h,v23.8h
1257
add w16,w16,w21
1258
rev32 v27.8h,v27.8h
1259
eor w9,w9,w13
1260
rev32 v31.8h,v31.8h
1261
eor w10,w10,w14
1262
add v10.4s,v10.4s,v11.4s
1263
eor w11,w11,w15
1264
add v14.4s,v14.4s,v15.4s
1265
eor w12,w12,w16
1266
add v18.4s,v18.4s,v19.4s
1267
ror w9,w9,#20
1268
add v22.4s,v22.4s,v23.4s
1269
ror w10,w10,#20
1270
add v26.4s,v26.4s,v27.4s
1271
ror w11,w11,#20
1272
add v30.4s,v30.4s,v31.4s
1273
ror w12,w12,#20
1274
eor v0.16b,v9.16b,v10.16b
1275
add w5,w5,w9
1276
eor v1.16b,v13.16b,v14.16b
1277
add w6,w6,w10
1278
eor v2.16b,v17.16b,v18.16b
1279
add w7,w7,w11
1280
eor v3.16b,v21.16b,v22.16b
1281
add w8,w8,w12
1282
eor v4.16b,v25.16b,v26.16b
1283
eor w17,w17,w5
1284
eor v5.16b,v29.16b,v30.16b
1285
eor w19,w19,w6
1286
ushr v9.4s,v0.4s,#20
1287
eor w20,w20,w7
1288
ushr v13.4s,v1.4s,#20
1289
eor w21,w21,w8
1290
ushr v17.4s,v2.4s,#20
1291
ror w17,w17,#24
1292
ushr v21.4s,v3.4s,#20
1293
ror w19,w19,#24
1294
ushr v25.4s,v4.4s,#20
1295
ror w20,w20,#24
1296
ushr v29.4s,v5.4s,#20
1297
ror w21,w21,#24
1298
sli v9.4s,v0.4s,#12
1299
add w13,w13,w17
1300
sli v13.4s,v1.4s,#12
1301
add w14,w14,w19
1302
sli v17.4s,v2.4s,#12
1303
add w15,w15,w20
1304
sli v21.4s,v3.4s,#12
1305
add w16,w16,w21
1306
sli v25.4s,v4.4s,#12
1307
eor w9,w9,w13
1308
sli v29.4s,v5.4s,#12
1309
eor w10,w10,w14
1310
add v8.4s,v8.4s,v9.4s
1311
eor w11,w11,w15
1312
add v12.4s,v12.4s,v13.4s
1313
eor w12,w12,w16
1314
add v16.4s,v16.4s,v17.4s
1315
ror w9,w9,#25
1316
add v20.4s,v20.4s,v21.4s
1317
ror w10,w10,#25
1318
add v24.4s,v24.4s,v25.4s
1319
ror w11,w11,#25
1320
add v28.4s,v28.4s,v29.4s
1321
ror w12,w12,#25
1322
eor v11.16b,v11.16b,v8.16b
1323
add w5,w5,w10
1324
eor v15.16b,v15.16b,v12.16b
1325
add w6,w6,w11
1326
eor v19.16b,v19.16b,v16.16b
1327
add w7,w7,w12
1328
eor v23.16b,v23.16b,v20.16b
1329
add w8,w8,w9
1330
eor v27.16b,v27.16b,v24.16b
1331
eor w21,w21,w5
1332
eor v31.16b,v31.16b,v28.16b
1333
eor w17,w17,w6
1334
tbl v11.16b,{v11.16b},v6.16b
1335
eor w19,w19,w7
1336
tbl v15.16b,{v15.16b},v6.16b
1337
eor w20,w20,w8
1338
tbl v19.16b,{v19.16b},v6.16b
1339
ror w21,w21,#16
1340
tbl v23.16b,{v23.16b},v6.16b
1341
ror w17,w17,#16
1342
tbl v27.16b,{v27.16b},v6.16b
1343
ror w19,w19,#16
1344
tbl v31.16b,{v31.16b},v6.16b
1345
ror w20,w20,#16
1346
add v10.4s,v10.4s,v11.4s
1347
add w15,w15,w21
1348
add v14.4s,v14.4s,v15.4s
1349
add w16,w16,w17
1350
add v18.4s,v18.4s,v19.4s
1351
add w13,w13,w19
1352
add v22.4s,v22.4s,v23.4s
1353
add w14,w14,w20
1354
add v26.4s,v26.4s,v27.4s
1355
eor w10,w10,w15
1356
add v30.4s,v30.4s,v31.4s
1357
eor w11,w11,w16
1358
eor v0.16b,v9.16b,v10.16b
1359
eor w12,w12,w13
1360
eor v1.16b,v13.16b,v14.16b
1361
eor w9,w9,w14
1362
eor v2.16b,v17.16b,v18.16b
1363
ror w10,w10,#20
1364
eor v3.16b,v21.16b,v22.16b
1365
ror w11,w11,#20
1366
eor v4.16b,v25.16b,v26.16b
1367
ror w12,w12,#20
1368
eor v5.16b,v29.16b,v30.16b
1369
ror w9,w9,#20
1370
ushr v9.4s,v0.4s,#25
1371
add w5,w5,w10
1372
ushr v13.4s,v1.4s,#25
1373
add w6,w6,w11
1374
ushr v17.4s,v2.4s,#25
1375
add w7,w7,w12
1376
ushr v21.4s,v3.4s,#25
1377
add w8,w8,w9
1378
ushr v25.4s,v4.4s,#25
1379
eor w21,w21,w5
1380
ushr v29.4s,v5.4s,#25
1381
eor w17,w17,w6
1382
sli v9.4s,v0.4s,#7
1383
eor w19,w19,w7
1384
sli v13.4s,v1.4s,#7
1385
eor w20,w20,w8
1386
sli v17.4s,v2.4s,#7
1387
ror w21,w21,#24
1388
sli v21.4s,v3.4s,#7
1389
ror w17,w17,#24
1390
sli v25.4s,v4.4s,#7
1391
ror w19,w19,#24
1392
sli v29.4s,v5.4s,#7
1393
ror w20,w20,#24
1394
ext v10.16b,v10.16b,v10.16b,#8
1395
add w15,w15,w21
1396
ext v14.16b,v14.16b,v14.16b,#8
1397
add w16,w16,w17
1398
ext v18.16b,v18.16b,v18.16b,#8
1399
add w13,w13,w19
1400
ext v22.16b,v22.16b,v22.16b,#8
1401
add w14,w14,w20
1402
ext v26.16b,v26.16b,v26.16b,#8
1403
eor w10,w10,w15
1404
ext v30.16b,v30.16b,v30.16b,#8
1405
eor w11,w11,w16
1406
ext v11.16b,v11.16b,v11.16b,#4
1407
eor w12,w12,w13
1408
ext v15.16b,v15.16b,v15.16b,#4
1409
eor w9,w9,w14
1410
ext v19.16b,v19.16b,v19.16b,#4
1411
ror w10,w10,#25
1412
ext v23.16b,v23.16b,v23.16b,#4
1413
ror w11,w11,#25
1414
ext v27.16b,v27.16b,v27.16b,#4
1415
ror w12,w12,#25
1416
ext v31.16b,v31.16b,v31.16b,#4
1417
ror w9,w9,#25
1418
ext v9.16b,v9.16b,v9.16b,#12
1419
ext v13.16b,v13.16b,v13.16b,#12
1420
ext v17.16b,v17.16b,v17.16b,#12
1421
ext v21.16b,v21.16b,v21.16b,#12
1422
ext v25.16b,v25.16b,v25.16b,#12
1423
ext v29.16b,v29.16b,v29.16b,#12
1424
cbnz x4,.Loop_upper_neon
1425
1426
add w5,w5,w22 // accumulate key block
1427
add x6,x6,x22,lsr#32
1428
add w7,w7,w23
1429
add x8,x8,x23,lsr#32
1430
add w9,w9,w24
1431
add x10,x10,x24,lsr#32
1432
add w11,w11,w25
1433
add x12,x12,x25,lsr#32
1434
add w13,w13,w26
1435
add x14,x14,x26,lsr#32
1436
add w15,w15,w27
1437
add x16,x16,x27,lsr#32
1438
add w17,w17,w28
1439
add x19,x19,x28,lsr#32
1440
add w20,w20,w30
1441
add x21,x21,x30,lsr#32
1442
1443
add x5,x5,x6,lsl#32 // pack
1444
add x7,x7,x8,lsl#32
1445
ldp x6,x8,[x1,#0] // load input
1446
add x9,x9,x10,lsl#32
1447
add x11,x11,x12,lsl#32
1448
ldp x10,x12,[x1,#16]
1449
add x13,x13,x14,lsl#32
1450
add x15,x15,x16,lsl#32
1451
ldp x14,x16,[x1,#32]
1452
add x17,x17,x19,lsl#32
1453
add x20,x20,x21,lsl#32
1454
ldp x19,x21,[x1,#48]
1455
add x1,x1,#64
1456
#ifdef __AARCH64EB__
1457
rev x5,x5
1458
rev x7,x7
1459
rev x9,x9
1460
rev x11,x11
1461
rev x13,x13
1462
rev x15,x15
1463
rev x17,x17
1464
rev x20,x20
1465
#endif
1466
eor x5,x5,x6
1467
eor x7,x7,x8
1468
eor x9,x9,x10
1469
eor x11,x11,x12
1470
eor x13,x13,x14
1471
eor x15,x15,x16
1472
eor x17,x17,x19
1473
eor x20,x20,x21
1474
1475
stp x5,x7,[x0,#0] // store output
1476
add x28,x28,#1 // increment counter
1477
mov w5,w22 // unpack key block
1478
lsr x6,x22,#32
1479
stp x9,x11,[x0,#16]
1480
mov w7,w23
1481
lsr x8,x23,#32
1482
stp x13,x15,[x0,#32]
1483
mov w9,w24
1484
lsr x10,x24,#32
1485
stp x17,x20,[x0,#48]
1486
add x0,x0,#64
1487
mov w11,w25
1488
lsr x12,x25,#32
1489
mov w13,w26
1490
lsr x14,x26,#32
1491
mov w15,w27
1492
lsr x16,x27,#32
1493
mov w17,w28
1494
lsr x19,x28,#32
1495
mov w20,w30
1496
lsr x21,x30,#32
1497
1498
mov x4,#5
1499
.Loop_lower_neon:
1500
sub x4,x4,#1
1501
add v8.4s,v8.4s,v9.4s
1502
add w5,w5,w9
1503
add v12.4s,v12.4s,v13.4s
1504
add w6,w6,w10
1505
add v16.4s,v16.4s,v17.4s
1506
add w7,w7,w11
1507
add v20.4s,v20.4s,v21.4s
1508
add w8,w8,w12
1509
add v24.4s,v24.4s,v25.4s
1510
eor w17,w17,w5
1511
add v28.4s,v28.4s,v29.4s
1512
eor w19,w19,w6
1513
eor v11.16b,v11.16b,v8.16b
1514
eor w20,w20,w7
1515
eor v15.16b,v15.16b,v12.16b
1516
eor w21,w21,w8
1517
eor v19.16b,v19.16b,v16.16b
1518
ror w17,w17,#16
1519
eor v23.16b,v23.16b,v20.16b
1520
ror w19,w19,#16
1521
eor v27.16b,v27.16b,v24.16b
1522
ror w20,w20,#16
1523
eor v31.16b,v31.16b,v28.16b
1524
ror w21,w21,#16
1525
rev32 v11.8h,v11.8h
1526
add w13,w13,w17
1527
rev32 v15.8h,v15.8h
1528
add w14,w14,w19
1529
rev32 v19.8h,v19.8h
1530
add w15,w15,w20
1531
rev32 v23.8h,v23.8h
1532
add w16,w16,w21
1533
rev32 v27.8h,v27.8h
1534
eor w9,w9,w13
1535
rev32 v31.8h,v31.8h
1536
eor w10,w10,w14
1537
add v10.4s,v10.4s,v11.4s
1538
eor w11,w11,w15
1539
add v14.4s,v14.4s,v15.4s
1540
eor w12,w12,w16
1541
add v18.4s,v18.4s,v19.4s
1542
ror w9,w9,#20
1543
add v22.4s,v22.4s,v23.4s
1544
ror w10,w10,#20
1545
add v26.4s,v26.4s,v27.4s
1546
ror w11,w11,#20
1547
add v30.4s,v30.4s,v31.4s
1548
ror w12,w12,#20
1549
eor v0.16b,v9.16b,v10.16b
1550
add w5,w5,w9
1551
eor v1.16b,v13.16b,v14.16b
1552
add w6,w6,w10
1553
eor v2.16b,v17.16b,v18.16b
1554
add w7,w7,w11
1555
eor v3.16b,v21.16b,v22.16b
1556
add w8,w8,w12
1557
eor v4.16b,v25.16b,v26.16b
1558
eor w17,w17,w5
1559
eor v5.16b,v29.16b,v30.16b
1560
eor w19,w19,w6
1561
ushr v9.4s,v0.4s,#20
1562
eor w20,w20,w7
1563
ushr v13.4s,v1.4s,#20
1564
eor w21,w21,w8
1565
ushr v17.4s,v2.4s,#20
1566
ror w17,w17,#24
1567
ushr v21.4s,v3.4s,#20
1568
ror w19,w19,#24
1569
ushr v25.4s,v4.4s,#20
1570
ror w20,w20,#24
1571
ushr v29.4s,v5.4s,#20
1572
ror w21,w21,#24
1573
sli v9.4s,v0.4s,#12
1574
add w13,w13,w17
1575
sli v13.4s,v1.4s,#12
1576
add w14,w14,w19
1577
sli v17.4s,v2.4s,#12
1578
add w15,w15,w20
1579
sli v21.4s,v3.4s,#12
1580
add w16,w16,w21
1581
sli v25.4s,v4.4s,#12
1582
eor w9,w9,w13
1583
sli v29.4s,v5.4s,#12
1584
eor w10,w10,w14
1585
add v8.4s,v8.4s,v9.4s
1586
eor w11,w11,w15
1587
add v12.4s,v12.4s,v13.4s
1588
eor w12,w12,w16
1589
add v16.4s,v16.4s,v17.4s
1590
ror w9,w9,#25
1591
add v20.4s,v20.4s,v21.4s
1592
ror w10,w10,#25
1593
add v24.4s,v24.4s,v25.4s
1594
ror w11,w11,#25
1595
add v28.4s,v28.4s,v29.4s
1596
ror w12,w12,#25
1597
eor v11.16b,v11.16b,v8.16b
1598
add w5,w5,w10
1599
eor v15.16b,v15.16b,v12.16b
1600
add w6,w6,w11
1601
eor v19.16b,v19.16b,v16.16b
1602
add w7,w7,w12
1603
eor v23.16b,v23.16b,v20.16b
1604
add w8,w8,w9
1605
eor v27.16b,v27.16b,v24.16b
1606
eor w21,w21,w5
1607
eor v31.16b,v31.16b,v28.16b
1608
eor w17,w17,w6
1609
tbl v11.16b,{v11.16b},v6.16b
1610
eor w19,w19,w7
1611
tbl v15.16b,{v15.16b},v6.16b
1612
eor w20,w20,w8
1613
tbl v19.16b,{v19.16b},v6.16b
1614
ror w21,w21,#16
1615
tbl v23.16b,{v23.16b},v6.16b
1616
ror w17,w17,#16
1617
tbl v27.16b,{v27.16b},v6.16b
1618
ror w19,w19,#16
1619
tbl v31.16b,{v31.16b},v6.16b
1620
ror w20,w20,#16
1621
add v10.4s,v10.4s,v11.4s
1622
add w15,w15,w21
1623
add v14.4s,v14.4s,v15.4s
1624
add w16,w16,w17
1625
add v18.4s,v18.4s,v19.4s
1626
add w13,w13,w19
1627
add v22.4s,v22.4s,v23.4s
1628
add w14,w14,w20
1629
add v26.4s,v26.4s,v27.4s
1630
eor w10,w10,w15
1631
add v30.4s,v30.4s,v31.4s
1632
eor w11,w11,w16
1633
eor v0.16b,v9.16b,v10.16b
1634
eor w12,w12,w13
1635
eor v1.16b,v13.16b,v14.16b
1636
eor w9,w9,w14
1637
eor v2.16b,v17.16b,v18.16b
1638
ror w10,w10,#20
1639
eor v3.16b,v21.16b,v22.16b
1640
ror w11,w11,#20
1641
eor v4.16b,v25.16b,v26.16b
1642
ror w12,w12,#20
1643
eor v5.16b,v29.16b,v30.16b
1644
ror w9,w9,#20
1645
ushr v9.4s,v0.4s,#25
1646
add w5,w5,w10
1647
ushr v13.4s,v1.4s,#25
1648
add w6,w6,w11
1649
ushr v17.4s,v2.4s,#25
1650
add w7,w7,w12
1651
ushr v21.4s,v3.4s,#25
1652
add w8,w8,w9
1653
ushr v25.4s,v4.4s,#25
1654
eor w21,w21,w5
1655
ushr v29.4s,v5.4s,#25
1656
eor w17,w17,w6
1657
sli v9.4s,v0.4s,#7
1658
eor w19,w19,w7
1659
sli v13.4s,v1.4s,#7
1660
eor w20,w20,w8
1661
sli v17.4s,v2.4s,#7
1662
ror w21,w21,#24
1663
sli v21.4s,v3.4s,#7
1664
ror w17,w17,#24
1665
sli v25.4s,v4.4s,#7
1666
ror w19,w19,#24
1667
sli v29.4s,v5.4s,#7
1668
ror w20,w20,#24
1669
ext v10.16b,v10.16b,v10.16b,#8
1670
add w15,w15,w21
1671
ext v14.16b,v14.16b,v14.16b,#8
1672
add w16,w16,w17
1673
ext v18.16b,v18.16b,v18.16b,#8
1674
add w13,w13,w19
1675
ext v22.16b,v22.16b,v22.16b,#8
1676
add w14,w14,w20
1677
ext v26.16b,v26.16b,v26.16b,#8
1678
eor w10,w10,w15
1679
ext v30.16b,v30.16b,v30.16b,#8
1680
eor w11,w11,w16
1681
ext v11.16b,v11.16b,v11.16b,#12
1682
eor w12,w12,w13
1683
ext v15.16b,v15.16b,v15.16b,#12
1684
eor w9,w9,w14
1685
ext v19.16b,v19.16b,v19.16b,#12
1686
ror w10,w10,#25
1687
ext v23.16b,v23.16b,v23.16b,#12
1688
ror w11,w11,#25
1689
ext v27.16b,v27.16b,v27.16b,#12
1690
ror w12,w12,#25
1691
ext v31.16b,v31.16b,v31.16b,#12
1692
ror w9,w9,#25
1693
ext v9.16b,v9.16b,v9.16b,#4
1694
ext v13.16b,v13.16b,v13.16b,#4
1695
ext v17.16b,v17.16b,v17.16b,#4
1696
ext v21.16b,v21.16b,v21.16b,#4
1697
ext v25.16b,v25.16b,v25.16b,#4
1698
ext v29.16b,v29.16b,v29.16b,#4
1699
add v8.4s,v8.4s,v9.4s
1700
add w5,w5,w9
1701
add v12.4s,v12.4s,v13.4s
1702
add w6,w6,w10
1703
add v16.4s,v16.4s,v17.4s
1704
add w7,w7,w11
1705
add v20.4s,v20.4s,v21.4s
1706
add w8,w8,w12
1707
add v24.4s,v24.4s,v25.4s
1708
eor w17,w17,w5
1709
add v28.4s,v28.4s,v29.4s
1710
eor w19,w19,w6
1711
eor v11.16b,v11.16b,v8.16b
1712
eor w20,w20,w7
1713
eor v15.16b,v15.16b,v12.16b
1714
eor w21,w21,w8
1715
eor v19.16b,v19.16b,v16.16b
1716
ror w17,w17,#16
1717
eor v23.16b,v23.16b,v20.16b
1718
ror w19,w19,#16
1719
eor v27.16b,v27.16b,v24.16b
1720
ror w20,w20,#16
1721
eor v31.16b,v31.16b,v28.16b
1722
ror w21,w21,#16
1723
rev32 v11.8h,v11.8h
1724
add w13,w13,w17
1725
rev32 v15.8h,v15.8h
1726
add w14,w14,w19
1727
rev32 v19.8h,v19.8h
1728
add w15,w15,w20
1729
rev32 v23.8h,v23.8h
1730
add w16,w16,w21
1731
rev32 v27.8h,v27.8h
1732
eor w9,w9,w13
1733
rev32 v31.8h,v31.8h
1734
eor w10,w10,w14
1735
add v10.4s,v10.4s,v11.4s
1736
eor w11,w11,w15
1737
add v14.4s,v14.4s,v15.4s
1738
eor w12,w12,w16
1739
add v18.4s,v18.4s,v19.4s
1740
ror w9,w9,#20
1741
add v22.4s,v22.4s,v23.4s
1742
ror w10,w10,#20
1743
add v26.4s,v26.4s,v27.4s
1744
ror w11,w11,#20
1745
add v30.4s,v30.4s,v31.4s
1746
ror w12,w12,#20
1747
eor v0.16b,v9.16b,v10.16b
1748
add w5,w5,w9
1749
eor v1.16b,v13.16b,v14.16b
1750
add w6,w6,w10
1751
eor v2.16b,v17.16b,v18.16b
1752
add w7,w7,w11
1753
eor v3.16b,v21.16b,v22.16b
1754
add w8,w8,w12
1755
eor v4.16b,v25.16b,v26.16b
1756
eor w17,w17,w5
1757
eor v5.16b,v29.16b,v30.16b
1758
eor w19,w19,w6
1759
ushr v9.4s,v0.4s,#20
1760
eor w20,w20,w7
1761
ushr v13.4s,v1.4s,#20
1762
eor w21,w21,w8
1763
ushr v17.4s,v2.4s,#20
1764
ror w17,w17,#24
1765
ushr v21.4s,v3.4s,#20
1766
ror w19,w19,#24
1767
ushr v25.4s,v4.4s,#20
1768
ror w20,w20,#24
1769
ushr v29.4s,v5.4s,#20
1770
ror w21,w21,#24
1771
sli v9.4s,v0.4s,#12
1772
add w13,w13,w17
1773
sli v13.4s,v1.4s,#12
1774
add w14,w14,w19
1775
sli v17.4s,v2.4s,#12
1776
add w15,w15,w20
1777
sli v21.4s,v3.4s,#12
1778
add w16,w16,w21
1779
sli v25.4s,v4.4s,#12
1780
eor w9,w9,w13
1781
sli v29.4s,v5.4s,#12
1782
eor w10,w10,w14
1783
add v8.4s,v8.4s,v9.4s
1784
eor w11,w11,w15
1785
add v12.4s,v12.4s,v13.4s
1786
eor w12,w12,w16
1787
add v16.4s,v16.4s,v17.4s
1788
ror w9,w9,#25
1789
add v20.4s,v20.4s,v21.4s
1790
ror w10,w10,#25
1791
add v24.4s,v24.4s,v25.4s
1792
ror w11,w11,#25
1793
add v28.4s,v28.4s,v29.4s
1794
ror w12,w12,#25
1795
eor v11.16b,v11.16b,v8.16b
1796
add w5,w5,w10
1797
eor v15.16b,v15.16b,v12.16b
1798
add w6,w6,w11
1799
eor v19.16b,v19.16b,v16.16b
1800
add w7,w7,w12
1801
eor v23.16b,v23.16b,v20.16b
1802
add w8,w8,w9
1803
eor v27.16b,v27.16b,v24.16b
1804
eor w21,w21,w5
1805
eor v31.16b,v31.16b,v28.16b
1806
eor w17,w17,w6
1807
tbl v11.16b,{v11.16b},v6.16b
1808
eor w19,w19,w7
1809
tbl v15.16b,{v15.16b},v6.16b
1810
eor w20,w20,w8
1811
tbl v19.16b,{v19.16b},v6.16b
1812
ror w21,w21,#16
1813
tbl v23.16b,{v23.16b},v6.16b
1814
ror w17,w17,#16
1815
tbl v27.16b,{v27.16b},v6.16b
1816
ror w19,w19,#16
1817
tbl v31.16b,{v31.16b},v6.16b
1818
ror w20,w20,#16
1819
add v10.4s,v10.4s,v11.4s
1820
add w15,w15,w21
1821
add v14.4s,v14.4s,v15.4s
1822
add w16,w16,w17
1823
add v18.4s,v18.4s,v19.4s
1824
add w13,w13,w19
1825
add v22.4s,v22.4s,v23.4s
1826
add w14,w14,w20
1827
add v26.4s,v26.4s,v27.4s
1828
eor w10,w10,w15
1829
add v30.4s,v30.4s,v31.4s
1830
eor w11,w11,w16
1831
eor v0.16b,v9.16b,v10.16b
1832
eor w12,w12,w13
1833
eor v1.16b,v13.16b,v14.16b
1834
eor w9,w9,w14
1835
eor v2.16b,v17.16b,v18.16b
1836
ror w10,w10,#20
1837
eor v3.16b,v21.16b,v22.16b
1838
ror w11,w11,#20
1839
eor v4.16b,v25.16b,v26.16b
1840
ror w12,w12,#20
1841
eor v5.16b,v29.16b,v30.16b
1842
ror w9,w9,#20
1843
ushr v9.4s,v0.4s,#25
1844
add w5,w5,w10
1845
ushr v13.4s,v1.4s,#25
1846
add w6,w6,w11
1847
ushr v17.4s,v2.4s,#25
1848
add w7,w7,w12
1849
ushr v21.4s,v3.4s,#25
1850
add w8,w8,w9
1851
ushr v25.4s,v4.4s,#25
1852
eor w21,w21,w5
1853
ushr v29.4s,v5.4s,#25
1854
eor w17,w17,w6
1855
sli v9.4s,v0.4s,#7
1856
eor w19,w19,w7
1857
sli v13.4s,v1.4s,#7
1858
eor w20,w20,w8
1859
sli v17.4s,v2.4s,#7
1860
ror w21,w21,#24
1861
sli v21.4s,v3.4s,#7
1862
ror w17,w17,#24
1863
sli v25.4s,v4.4s,#7
1864
ror w19,w19,#24
1865
sli v29.4s,v5.4s,#7
1866
ror w20,w20,#24
1867
ext v10.16b,v10.16b,v10.16b,#8
1868
add w15,w15,w21
1869
ext v14.16b,v14.16b,v14.16b,#8
1870
add w16,w16,w17
1871
ext v18.16b,v18.16b,v18.16b,#8
1872
add w13,w13,w19
1873
ext v22.16b,v22.16b,v22.16b,#8
1874
add w14,w14,w20
1875
ext v26.16b,v26.16b,v26.16b,#8
1876
eor w10,w10,w15
1877
ext v30.16b,v30.16b,v30.16b,#8
1878
eor w11,w11,w16
1879
ext v11.16b,v11.16b,v11.16b,#4
1880
eor w12,w12,w13
1881
ext v15.16b,v15.16b,v15.16b,#4
1882
eor w9,w9,w14
1883
ext v19.16b,v19.16b,v19.16b,#4
1884
ror w10,w10,#25
1885
ext v23.16b,v23.16b,v23.16b,#4
1886
ror w11,w11,#25
1887
ext v27.16b,v27.16b,v27.16b,#4
1888
ror w12,w12,#25
1889
ext v31.16b,v31.16b,v31.16b,#4
1890
ror w9,w9,#25
1891
ext v9.16b,v9.16b,v9.16b,#12
1892
ext v13.16b,v13.16b,v13.16b,#12
1893
ext v17.16b,v17.16b,v17.16b,#12
1894
ext v21.16b,v21.16b,v21.16b,#12
1895
ext v25.16b,v25.16b,v25.16b,#12
1896
ext v29.16b,v29.16b,v29.16b,#12
1897
cbnz x4,.Loop_lower_neon
1898
1899
add w5,w5,w22 // accumulate key block
1900
ldp q0,q1,[sp,#0]
1901
add x6,x6,x22,lsr#32
1902
ldp q2,q3,[sp,#32]
1903
add w7,w7,w23
1904
ldp q4,q5,[sp,#64]
1905
add x8,x8,x23,lsr#32
1906
ldr q6,[sp,#96]
1907
add v8.4s,v8.4s,v0.4s
1908
add w9,w9,w24
1909
add v12.4s,v12.4s,v0.4s
1910
add x10,x10,x24,lsr#32
1911
add v16.4s,v16.4s,v0.4s
1912
add w11,w11,w25
1913
add v20.4s,v20.4s,v0.4s
1914
add x12,x12,x25,lsr#32
1915
add v24.4s,v24.4s,v0.4s
1916
add w13,w13,w26
1917
add v28.4s,v28.4s,v0.4s
1918
add x14,x14,x26,lsr#32
1919
add v10.4s,v10.4s,v2.4s
1920
add w15,w15,w27
1921
add v14.4s,v14.4s,v2.4s
1922
add x16,x16,x27,lsr#32
1923
add v18.4s,v18.4s,v2.4s
1924
add w17,w17,w28
1925
add v22.4s,v22.4s,v2.4s
1926
add x19,x19,x28,lsr#32
1927
add v26.4s,v26.4s,v2.4s
1928
add w20,w20,w30
1929
add v30.4s,v30.4s,v2.4s
1930
add x21,x21,x30,lsr#32
1931
add v27.4s,v27.4s,v7.4s // +4
1932
add x5,x5,x6,lsl#32 // pack
1933
add v31.4s,v31.4s,v7.4s // +4
1934
add x7,x7,x8,lsl#32
1935
add v11.4s,v11.4s,v3.4s
1936
ldp x6,x8,[x1,#0] // load input
1937
add v15.4s,v15.4s,v4.4s
1938
add x9,x9,x10,lsl#32
1939
add v19.4s,v19.4s,v5.4s
1940
add x11,x11,x12,lsl#32
1941
add v23.4s,v23.4s,v6.4s
1942
ldp x10,x12,[x1,#16]
1943
add v27.4s,v27.4s,v3.4s
1944
add x13,x13,x14,lsl#32
1945
add v31.4s,v31.4s,v4.4s
1946
add x15,x15,x16,lsl#32
1947
add v9.4s,v9.4s,v1.4s
1948
ldp x14,x16,[x1,#32]
1949
add v13.4s,v13.4s,v1.4s
1950
add x17,x17,x19,lsl#32
1951
add v17.4s,v17.4s,v1.4s
1952
add x20,x20,x21,lsl#32
1953
add v21.4s,v21.4s,v1.4s
1954
ldp x19,x21,[x1,#48]
1955
add v25.4s,v25.4s,v1.4s
1956
add x1,x1,#64
1957
add v29.4s,v29.4s,v1.4s
1958
1959
#ifdef __AARCH64EB__
1960
rev x5,x5
1961
rev x7,x7
1962
rev x9,x9
1963
rev x11,x11
1964
rev x13,x13
1965
rev x15,x15
1966
rev x17,x17
1967
rev x20,x20
1968
#endif
1969
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1970
eor x5,x5,x6
1971
eor x7,x7,x8
1972
eor x9,x9,x10
1973
eor x11,x11,x12
1974
eor x13,x13,x14
1975
eor v8.16b,v8.16b,v0.16b
1976
eor x15,x15,x16
1977
eor v9.16b,v9.16b,v1.16b
1978
eor x17,x17,x19
1979
eor v10.16b,v10.16b,v2.16b
1980
eor x20,x20,x21
1981
eor v11.16b,v11.16b,v3.16b
1982
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1983
1984
stp x5,x7,[x0,#0] // store output
1985
add x28,x28,#7 // increment counter
1986
stp x9,x11,[x0,#16]
1987
stp x13,x15,[x0,#32]
1988
stp x17,x20,[x0,#48]
1989
add x0,x0,#64
1990
st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1991
1992
ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1993
eor v12.16b,v12.16b,v0.16b
1994
eor v13.16b,v13.16b,v1.16b
1995
eor v14.16b,v14.16b,v2.16b
1996
eor v15.16b,v15.16b,v3.16b
1997
st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1998
1999
ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
2000
eor v16.16b,v16.16b,v8.16b
2001
ldp q0,q1,[sp,#0]
2002
eor v17.16b,v17.16b,v9.16b
2003
ldp q2,q3,[sp,#32]
2004
eor v18.16b,v18.16b,v10.16b
2005
eor v19.16b,v19.16b,v11.16b
2006
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
2007
2008
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
2009
eor v20.16b,v20.16b,v12.16b
2010
eor v21.16b,v21.16b,v13.16b
2011
eor v22.16b,v22.16b,v14.16b
2012
eor v23.16b,v23.16b,v15.16b
2013
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
2014
2015
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
2016
eor v24.16b,v24.16b,v16.16b
2017
eor v25.16b,v25.16b,v17.16b
2018
eor v26.16b,v26.16b,v18.16b
2019
eor v27.16b,v27.16b,v19.16b
2020
st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
2021
2022
shl v8.4s,v7.4s,#1 // 4 -> 8
2023
eor v28.16b,v28.16b,v20.16b
2024
eor v29.16b,v29.16b,v21.16b
2025
eor v30.16b,v30.16b,v22.16b
2026
eor v31.16b,v31.16b,v23.16b
2027
st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
2028
2029
add v3.4s,v3.4s,v8.4s // += 8
2030
add v4.4s,v4.4s,v8.4s
2031
add v5.4s,v5.4s,v8.4s
2032
add v6.4s,v6.4s,v8.4s
2033
2034
b.hs .Loop_outer_512_neon
2035
2036
adds x2,x2,#512
2037
ushr v7.4s,v7.4s,#1 // 4 -> 2
2038
2039
ldp d10,d11,[sp,#128+16] // meet ABI requirements
2040
ldp d12,d13,[sp,#128+32]
2041
ldp d14,d15,[sp,#128+48]
2042
2043
stp q0,q0,[sp,#0] // wipe off-load area
2044
stp q0,q0,[sp,#32]
2045
stp q0,q0,[sp,#64]
2046
2047
b.eq .Ldone_512_neon
2048
2049
sub x3,x3,#16 // .Lone
2050
cmp x2,#192
2051
add sp,sp,#128
2052
sub v3.4s,v3.4s,v7.4s // -= 2
2053
ld1 {v8.4s,v9.4s},[x3]
2054
b.hs .Loop_outer_neon
2055
2056
ldp d8,d9,[sp,#0] // meet ABI requirements
2057
eor v1.16b,v1.16b,v1.16b
2058
eor v2.16b,v2.16b,v2.16b
2059
eor v3.16b,v3.16b,v3.16b
2060
eor v4.16b,v4.16b,v4.16b
2061
eor v5.16b,v5.16b,v5.16b
2062
eor v6.16b,v6.16b,v6.16b
2063
b .Loop_outer
2064
2065
.Ldone_512_neon:
2066
ldp d8,d9,[sp,#128+0] // meet ABI requirements
2067
ldp x19,x20,[x29,#16]
2068
add sp,sp,#128+64
2069
ldp x21,x22,[x29,#32]
2070
ldp x23,x24,[x29,#48]
2071
ldp x25,x26,[x29,#64]
2072
ldp x27,x28,[x29,#80]
2073
ldp x29,x30,[sp],#96
2074
AARCH64_VALIDATE_LINK_REGISTER
2075
ret
2076
.size ChaCha20_512_neon,.-ChaCha20_512_neon
2077
2078