/* Origin: GitHub repository freebsd/freebsd-src, path sys/crypto/openssl/aarch64/keccak1600-armv8.S (main branch). */
/* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */
#include "arm_arch.h"
// Round constants for the Iota step, placed in read-only data.
//
// Layout: 64 bytes of zero padding followed by 24 eight-byte constants,
// i.e. 64 + 24*8 = 256 bytes starting on the boundary requested by
// ".align 8". The address just past the last constant therefore has its
// low 8 bits clear; KeccakF1600_int tests exactly that (tst x27,#255)
// to detect that all constants have been consumed.
// NOTE(review): this depends on ".align 8" meaning a 2^8 = 256 byte
// boundary, which is how GNU as treats it for AArch64 - confirm for any
// other assembler.
.section	.rodata

.align	8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.text

// KeccakF1600_int - register-resident permutation loop (file-local).
//
// In/out: the 25 state lanes live in general registers, in the order used
//         by the load/store sequences of the callers in this file:
//         lanes 0..17 in x0..x17, lane 18 in x25, lane 19 in x19,
//         lanes 20..24 in x20..x24.
// Stack:  the caller must provide 32 writable bytes at [sp]:
//         [sp,#0]  spill slot for x4/x9 during Theta
//         [sp,#16] current pointer into the iotas table
//         [sp,#24] saved x30 (return address)
// Scratch: x26, x27, x28 and x30 are overwritten; flags are clobbered.
// Loop:   one iteration per round constant. The iotas pointer is
//         post-incremented by 8 each round and the loop ends when its low
//         8 bits become zero (see the padding/alignment of the table).
//         The flags set by "tst" stay valid up to the final "bne" because
//         only bic/eor/str follow it.
// AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER are macros
// that are not defined in this file (presumably from arm_arch.h - confirm).
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	AARCH64_SIGN_LINK_REGISTER
	adrp	x28,iotas
	add	x28,x28,#:lo12:iotas
	stp	x28,x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	// Column parities are accumulated in x26, x27, x28, x30 and x4;
	// the original x4/x9 lanes are parked on the stack meanwhile.
	eor	x26,x0,x5
	stp	x4,x9,[sp,#0]	// offload pair...
	eor	x27,x1,x6
	eor	x28,x2,x7
	eor	x30,x3,x8
	eor	x4,x4,x9
	eor	x26,x26,x10
	eor	x27,x27,x11
	eor	x28,x28,x12
	eor	x30,x30,x13
	eor	x4,x4,x14
	eor	x26,x26,x15
	eor	x27,x27,x16
	eor	x28,x28,x17
	eor	x30,x30,x25
	eor	x4,x4,x19
	eor	x26,x26,x20
	eor	x28,x28,x22
	eor	x27,x27,x21
	eor	x30,x30,x23
	eor	x4,x4,x24

	eor	x9,x26,x28,ror#63

	eor	x1,x1,x9
	eor	x6,x6,x9
	eor	x11,x11,x9
	eor	x16,x16,x9
	eor	x21,x21,x9

	eor	x9,x27,x30,ror#63
	eor	x28,x28,x4,ror#63
	eor	x30,x30,x26,ror#63
	eor	x4,x4,x27,ror#63

	eor	x27, x2,x9		// mov	x27,x2
	eor	x7,x7,x9
	eor	x12,x12,x9
	eor	x17,x17,x9
	eor	x22,x22,x9

	eor	x0,x0,x4
	eor	x5,x5,x4
	eor	x10,x10,x4
	eor	x15,x15,x4
	eor	x20,x20,x4
	ldp	x4,x9,[sp,#0]		// re-load offloaded data
	eor	x26, x3,x28		// mov	x26,x3
	eor	x8,x8,x28
	eor	x13,x13,x28
	eor	x25,x25,x28
	eor	x23,x23,x28

	eor	x28, x4,x30		// mov	x28,x4
	eor	x9,x9,x30
	eor	x14,x14,x30
	eor	x19,x19,x30
	eor	x24,x24,x30

	////////////////////////////////////////// Rho+Pi
	// Each ror both rotates a lane and moves it to its new register.
	// x30, x27, x26 and x28 hold the lanes that were copied aside
	// (x1 here, x2/x3/x4 already during Theta above).
	mov	x30,x1
	ror	x1,x6,#64-44
	//mov	x27,x2
	ror	x2,x12,#64-43
	//mov	x26,x3
	ror	x3,x25,#64-21
	//mov	x28,x4
	ror	x4,x24,#64-14

	ror	x6,x9,#64-20
	ror	x12,x13,#64-25
	ror	x25,x17,#64-15
	ror	x24,x21,#64-2

	ror	x9,x22,#64-61
	ror	x13,x19,#64-8
	ror	x17,x11,#64-10
	ror	x21,x8,#64-55

	ror	x22,x14,#64-39
	ror	x19,x23,#64-56
	ror	x11,x7,#64-6
	ror	x8,x16,#64-45

	ror	x14,x20,#64-18
	ror	x23,x15,#64-41
	ror	x7,x10,#64-3
	ror	x16,x5,#64-36

	ror	x5,x26,#64-28
	ror	x10,x30,#64-1
	ror	x15,x28,#64-27
	ror	x20,x27,#64-62

	////////////////////////////////////////// Chi+Iota
	// Five groups of five lanes; the iota load, pointer test and
	// pointer store are interleaved with the first two groups.
	bic	x26,x2,x1
	bic	x27,x3,x2
	bic	x28,x0,x4
	bic	x30,x1,x0
	eor	x0,x0,x26
	bic	x26,x4,x3
	eor	x1,x1,x27
	ldr	x27,[sp,#16]		// current iotas pointer
	eor	x3,x3,x28
	eor	x4,x4,x30
	eor	x2,x2,x26
	ldr	x30,[x27],#8		// Iota[i++]

	bic	x26,x7,x6
	tst	x27,#255			// are we done?
	str	x27,[sp,#16]		// keep advanced pointer for next round
	bic	x27,x8,x7
	bic	x28,x5,x9
	eor	x0,x0,x30		// A[0][0] ^= Iota
	bic	x30,x6,x5
	eor	x5,x5,x26
	bic	x26,x9,x8
	eor	x6,x6,x27
	eor	x8,x8,x28
	eor	x9,x9,x30
	eor	x7,x7,x26

	bic	x26,x12,x11
	bic	x27,x13,x12
	bic	x28,x10,x14
	bic	x30,x11,x10
	eor	x10,x10,x26
	bic	x26,x14,x13
	eor	x11,x11,x27
	eor	x13,x13,x28
	eor	x14,x14,x30
	eor	x12,x12,x26

	bic	x26,x17,x16
	bic	x27,x25,x17
	bic	x28,x15,x19
	bic	x30,x16,x15
	eor	x15,x15,x26
	bic	x26,x19,x25
	eor	x16,x16,x27
	eor	x25,x25,x28
	eor	x19,x19,x30
	eor	x17,x17,x26

	bic	x26,x22,x21
	bic	x27,x23,x22
	bic	x28,x20,x24
	bic	x30,x21,x20
	eor	x20,x20,x26
	bic	x26,x24,x23
	eor	x21,x21,x27
	eor	x23,x23,x28
	eor	x24,x24,x30
	eor	x22,x22,x26

	bne	.Loop			// flags still from the tst above

	ldr	x30,[sp,#24]		// x30 was used as scratch; restore it
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

// KeccakF1600 - apply the permutation to a state held in memory
// (file-local; no .globl is emitted for it).
//
// In:    x0 = address of the state: 25 consecutive 64-bit lanes (200 bytes).
// Out:   the state at that address is overwritten with the permuted lanes.
// Frame: 128 bytes holding x29/x30 at offset 0 and x19..x28 at offsets
//        16..95, then a further 48 bytes below it:
//        [sp,#0..31]  scratch area required by KeccakF1600_int
//        [sp,#32]     saved state pointer
// x19..x28 are restored through x29 before returning. x0..x17 leave this
// routine holding state lanes, not their entry values.
.type	KeccakF1600,%function
.align	5
KeccakF1600:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	x26,x0			// x0 is about to become lane 0
	ldp	x0,x1,[x0,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]	// lanes 18/19 live in x25/x19
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]

	bl	KeccakF1600_int

	ldr	x26,[sp,#32]		// state pointer (x26 was scratch)
	stp	x0,x1,[x26,#16*0]
	stp	x2,x3,[x26,#16*1]
	stp	x4,x5,[x26,#16*2]
	stp	x6,x7,[x26,#16*3]
	stp	x8,x9,[x26,#16*4]
	stp	x10,x11,[x26,#16*5]
	stp	x12,x13,[x26,#16*6]
	stp	x14,x15,[x26,#16*7]
	stp	x16,x17,[x26,#16*8]
	stp	x25,x19,[x26,#16*9]
	stp	x20,x21,[x26,#16*10]
	stp	x22,x23,[x26,#16*11]
	str	x24,[x26,#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600,.-KeccakF1600

// SHA3_absorb - XOR whole input blocks into the state and permute.
//
// In:   x0 = uint64_t A[5][5] (state), x1 = const void *inp,
//       x2 = size_t len, x3 = size_t bsz (block size in bytes).
// Out:  x0 = number of input bytes left over (the value of len at the
//       point where len < bsz).
// Frame: 128-byte register save area (x29/x30, x19..x28) plus 64 bytes:
//       [sp,#0..31] scratch for KeccakF1600_int
//       [sp,#32] A    [sp,#40] inp    [sp,#48] len    [sp,#56] bsz
// Inside the loop x27 = inp, x28 = len, x30 = bsz and x26 is scratch;
// the state lanes stay in registers across blocks (lane 18 in x25,
// lane 19 in x19, all others in the like-numbered x register).
// Compare ladder: after each even-numbered lane k, bsz is compared with
// 8*(k+2); "blo" leaves when the block ends after lane k and the "beq"
// following the next lane leaves when it ends after lane k+1. The
// instructions between cmp and beq (ldr, rev, eor) leave the flags alone.
// NOTE(review): the ladder only lands exactly on the block end when bsz
// is a multiple of 8 - presumably guaranteed by callers; confirm.
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	x26,x0			// uint64_t A[5][5]
	mov	x27,x1			// const void *inp
	mov	x28,x2			// size_t len
	mov	x30,x3			// size_t bsz
	ldp	x0,x1,[x26,#16*0]
	ldp	x2,x3,[x26,#16*1]
	ldp	x4,x5,[x26,#16*2]
	ldp	x6,x7,[x26,#16*3]
	ldp	x8,x9,[x26,#16*4]
	ldp	x10,x11,[x26,#16*5]
	ldp	x12,x13,[x26,#16*6]
	ldp	x14,x15,[x26,#16*7]
	ldp	x16,x17,[x26,#16*8]
	ldp	x25,x19,[x26,#16*9]
	ldp	x20,x21,[x26,#16*10]
	ldp	x22,x23,[x26,#16*11]
	ldr	x24,[x26,#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	x26,x28,x30		// len - bsz
	blo	.Labsorbed

	str	x26,[sp,#48]			// save len - bsz
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x0,x0,x26
	cmp	x30,#8*(0+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x1,x1,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x2,x2,x26
	cmp	x30,#8*(2+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x3,x3,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x4,x4,x26
	cmp	x30,#8*(4+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x5,x5,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x6,x6,x26
	cmp	x30,#8*(6+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x7,x7,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x8,x8,x26
	cmp	x30,#8*(8+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x9,x9,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x10,x10,x26
	cmp	x30,#8*(10+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x11,x11,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x12,x12,x26
	cmp	x30,#8*(12+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x13,x13,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x14,x14,x26
	cmp	x30,#8*(14+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x15,x15,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x16,x16,x26
	cmp	x30,#8*(16+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x17,x17,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x25,x25,x26
	cmp	x30,#8*(18+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x19,x19,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x20,x20,x26
	cmp	x30,#8*(20+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x21,x21,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x22,x22,x26
	cmp	x30,#8*(22+2)
	blo	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x23,x23,x26
	beq	.Lprocess_block
	ldr	x26,[x27],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	x26,x26
#endif
	eor	x24,x24,x26

.Lprocess_block:
	str	x27,[sp,#40]			// save inp

	bl	KeccakF1600_int		// overwrites x26, x27, x28, x30

	ldr	x27,[sp,#40]			// restore arguments
	ldp	x28,x30,[sp,#48]		// len (already reduced), bsz
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	x27,[sp,#32]		// state pointer
	stp	x0,x1,[x27,#16*0]
	stp	x2,x3,[x27,#16*1]
	stp	x4,x5,[x27,#16*2]
	stp	x6,x7,[x27,#16*3]
	stp	x8,x9,[x27,#16*4]
	stp	x10,x11,[x27,#16*5]
	stp	x12,x13,[x27,#16*6]
	stp	x14,x15,[x27,#16*7]
	stp	x16,x17,[x27,#16*8]
	stp	x25,x19,[x27,#16*9]
	stp	x20,x21,[x27,#16*10]
	stp	x22,x23,[x27,#16*11]
	str	x24,[x27,#16*12]

	mov	x0,x28			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze - copy output bytes out of the state, permuting between
// blocks.
//
// In:   x0 = state address, x1 = output buffer, x2 = number of bytes to
//       produce, x3 = block size in bytes, w4 = "next" flag.
//       When w4 is non-zero the state is permuted once before any byte
//       is copied.
// Regs: x19 = state base, x20 = output cursor, x21 = bytes still wanted,
//       x22 = block size; x0 = read cursor inside the state, x3 = bytes
//       left in the current block, x4 = lane being copied.
// Full lanes are stored 8 bytes at a time (byte-swapped first on
// big-endian builds). A final partial lane is stored byte by byte, least
// significant byte first, from the lane as loaded (the branch to the
// tail is taken before the rev).
// NOTE(review): with x2 == 0 on entry the tail is entered and stores 7
// bytes, because the first strb precedes the first length check and the
// subs wraps below zero. Presumably callers never pass a zero length -
// confirm.
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	x19,x0			// put aside arguments
	mov	x20,x1
	mov	x21,x2
	mov	x22,x3
	cmp	w4, #0				// w4 = 'next' argument
	bne	.Lnext_block

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	x21,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x20],#8
	subs	x21,x21,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8		// bytes left in this block
	bhi	.Loop_squeeze
.Lnext_block:
	mov	x0,x19
	bl	KeccakF1600
	mov	x0,x19			// restart at the first lane
	mov	x3,x22			// full block available again
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1
	lsr	x4,x4,#8
	subs	x21,x21,#1
	beq	.Lsqueeze_done
	strb	w4,[x20],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce - permutation loop on vector registers (file-local leaf).
//
// In/out: state lanes 0..24 in v0..v24; the callers in this file load
//         and store them through d0..d24 (the low 64 bits).
// Scratch: v25..v31, x9 (round counter, starts at 24), x10 (pointer into
//         the iotas table), flags. v8..v15 are modified here, so callers
//         save d8..d15 themselves. No stack is used and x30 is not
//         touched.
// The eor3/rax1/xar/bcax instructions are emitted as raw .inst words;
// the intended mnemonic is given in the comment on each line.
// NOTE(review): presumably done so the file assembles with toolchains
// that lack these mnemonics - confirm against the generator script.
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#24
	adrp	x10,iotas
	add	x10,x10,#:lo12:iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
.inst	0xce0f2a99	//eor3 v25.16b,v20.16b,v15.16b,v10.16b
.inst	0xce102eba	//eor3 v26.16b,v21.16b,v16.16b,v11.16b
.inst	0xce1132db	//eor3 v27.16b,v22.16b,v17.16b,v12.16b
.inst	0xce1236fc	//eor3 v28.16b,v23.16b,v18.16b,v13.16b
.inst	0xce133b1d	//eor3 v29.16b,v24.16b,v19.16b,v14.16b
.inst	0xce050339	//eor3 v25.16b,v25.16b, v5.16b,v0.16b
.inst	0xce06075a	//eor3 v26.16b,v26.16b, v6.16b,v1.16b
.inst	0xce070b7b	//eor3 v27.16b,v27.16b, v7.16b,v2.16b
.inst	0xce080f9c	//eor3 v28.16b,v28.16b, v8.16b,v3.16b
.inst	0xce0913bd	//eor3 v29.16b,v29.16b, v9.16b,v4.16b

.inst	0xce7b8f3e	//rax1 v30.16b,v25.16b,v27.16b			// D[1]
.inst	0xce7c8f5f	//rax1 v31.16b,v26.16b,v28.16b			// D[2]
.inst	0xce7d8f7b	//rax1 v27.16b,v27.16b,v29.16b			// D[3]
.inst	0xce798f9c	//rax1 v28.16b,v28.16b,v25.16b			// D[4]
.inst	0xce7a8fbd	//rax1 v29.16b,v29.16b,v26.16b			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
.inst	0xce9efc39	//xar v25.16b, v1.16b,v30.16b,#64-1 // C[0]=A[2][0]

.inst	0xce9e50c1	//xar v1.16b,v6.16b,v30.16b,#64-44
.inst	0xce9cb126	//xar v6.16b,v9.16b,v28.16b,#64-20
.inst	0xce9f0ec9	//xar v9.16b,v22.16b,v31.16b,#64-61
.inst	0xce9c65d6	//xar v22.16b,v14.16b,v28.16b,#64-39
.inst	0xce9dba8e	//xar v14.16b,v20.16b,v29.16b,#64-18

.inst	0xce9f085a	//xar v26.16b, v2.16b,v31.16b,#64-62 // C[1]=A[4][0]

.inst	0xce9f5582	//xar v2.16b,v12.16b,v31.16b,#64-43
.inst	0xce9b9dac	//xar v12.16b,v13.16b,v27.16b,#64-25
.inst	0xce9ce26d	//xar v13.16b,v19.16b,v28.16b,#64-8
.inst	0xce9b22f3	//xar v19.16b,v23.16b,v27.16b,#64-56
.inst	0xce9d5df7	//xar v23.16b,v15.16b,v29.16b,#64-41

.inst	0xce9c948f	//xar v15.16b,v4.16b,v28.16b,#64-27

.inst	0xce9ccb1c	//xar v28.16b, v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
.inst	0xce9efab8	//xar v24.16b,v21.16b,v30.16b,#64-2
.inst	0xce9b2508	//xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
.inst	0xce9e4e04	//xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
.inst	0xce9d70b0	//xar v16.16b,v5.16b,v29.16b,#64-36

.inst	0xce9b9065	//xar v5.16b,v3.16b,v27.16b,#64-28

	eor	v0.16b,v0.16b,v29.16b

.inst	0xce9bae5b	//xar v27.16b, v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
.inst	0xce9fc623	//xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
.inst	0xce9ed97e	//xar v30.16b, v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
.inst	0xce9fe8ff	//xar v31.16b, v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
.inst	0xce9df55d	//xar v29.16b, v10.16b,v29.16b,#64-3 // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
.inst	0xce362354	//bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1]
.inst	0xce375915	//bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1]
.inst	0xce385ed6	//bcax v22.16b,v22.16b,v24.16b,v23.16b
.inst	0xce3a62f7	//bcax v23.16b,v23.16b,v26.16b, v24.16b
.inst	0xce286b18	//bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1]

	ld1r	{v26.2d},[x10],#8	// next round constant, pointer advanced

.inst	0xce330fd1	//bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3]
.inst	0xce2f4c72	//bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3]
.inst	0xce303e73	//bcax v19.16b,v19.16b,v16.16b,v15.16b
.inst	0xce3e41ef	//bcax v15.16b,v15.16b,v30.16b, v16.16b
.inst	0xce237a10	//bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3]

.inst	0xce2c7f2a	//bcax v10.16b,v25.16b, v12.16b,v31.16b
.inst	0xce2d33eb	//bcax v11.16b,v31.16b, v13.16b,v12.16b
.inst	0xce2e358c	//bcax v12.16b,v12.16b,v14.16b,v13.16b
.inst	0xce3939ad	//bcax v13.16b,v13.16b,v25.16b, v14.16b
.inst	0xce3f65ce	//bcax v14.16b,v14.16b,v31.16b, v25.16b

.inst	0xce2913a7	//bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3]
.inst	0xce252488	//bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3]
.inst	0xce261529	//bcax v9.16b,v9.16b,v6.16b,v5.16b
.inst	0xce3d18a5	//bcax v5.16b,v5.16b,v29.16b, v6.16b
.inst	0xce2474c6	//bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3]

.inst	0xce207363	//bcax v3.16b,v27.16b, v0.16b,v28.16b
.inst	0xce210384	//bcax v4.16b,v28.16b, v1.16b,v0.16b
.inst	0xce220400	//bcax v0.16b,v0.16b,v2.16b,v1.16b
.inst	0xce3b0821	//bcax v1.16b,v1.16b,v27.16b, v2.16b
.inst	0xce3c6c42	//bcax v2.16b,v2.16b,v28.16b, v27.16b

	eor	v0.16b,v0.16b,v26.16b	// lane 0 ^= round constant

	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

// KeccakF1600_cext - apply the vector-register permutation to a state
// held in memory (file-local).
//
// In:    x0 = address of the state: 25 consecutive 64-bit lanes.
// Out:   the state at that address is overwritten. x0 still holds the
//        address on return (KeccakF1600_ce only writes x9 and x10 among
//        the general registers), but x9 and x10 are clobbered.
// Frame: 80 bytes: x29/x30 at offset 0, d8..d15 at offsets 16..79.
// x30 is reloaded from the frame right after the call; the epilogue then
// only pops x29.
.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]		// return address overwritten by the bl
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
// SHA3_absorb_cext - XOR whole input blocks into the state and permute,
// using the vector-register permutation.
//
// In:   x0 = state address (25 64-bit lanes), x1 = input pointer,
//       x2 = input length in bytes, x3 = block size in bytes.
// Out:  x0 = number of input bytes left over. x2 is reduced by x3 at the
//       top of every iteration, so on exit the last subtraction is undone
//       with "add x0,x2,x3".
// Frame: 80 bytes: x29/x30 at offset 0, d8..d15 at offsets 16..79.
// The state stays in v0..v24 across blocks; v31 carries each input lane
// (byte-reversed on big-endian builds). x0..x3 survive the calls to
// KeccakF1600_ce, which only writes x9 and x10 among the general
// registers.
// Compare ladder: after each even-numbered lane k, x3 is compared with
// 8*(k+2); "blo" leaves when the block ends after lane k and the "beq"
// following the next lane leaves when it ends after lane k+1. The
// instructions between cmp and beq (ldr, rev64, eor) leave the flags
// alone.
// NOTE(review): the ladder only lands exactly on the block end when x3
// is a multiple of 8 - presumably guaranteed by callers; confirm.
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	ldp	d0,d1,[x0,#8*0]
	ldp	d2,d3,[x0,#8*2]
	ldp	d4,d5,[x0,#8*4]
	ldp	d6,d7,[x0,#8*6]
	ldp	d8,d9,[x0,#8*8]
	ldp	d10,d11,[x0,#8*10]
	ldp	d12,d13,[x0,#8*12]
	ldp	d14,d15,[x0,#8*14]
	ldp	d16,d17,[x0,#8*16]
	ldp	d18,d19,[x0,#8*18]
	ldp	d20,d21,[x0,#8*20]
	ldp	d22,d23,[x0,#8*22]
	ldr	d24,[x0,#8*24]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	x2,x2,x3		// len - bsz
	blo	.Labsorbed_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v0.16b,v0.16b,v31.16b
	cmp	x3,#8*(0+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v1.16b,v1.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v2.16b,v2.16b,v31.16b
	cmp	x3,#8*(2+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v3.16b,v3.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v4.16b,v4.16b,v31.16b
	cmp	x3,#8*(4+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v5.16b,v5.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v6.16b,v6.16b,v31.16b
	cmp	x3,#8*(6+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v7.16b,v7.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v8.16b,v8.16b,v31.16b
	cmp	x3,#8*(8+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v9.16b,v9.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v10.16b,v10.16b,v31.16b
	cmp	x3,#8*(10+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v11.16b,v11.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v12.16b,v12.16b,v31.16b
	cmp	x3,#8*(12+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v13.16b,v13.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v14.16b,v14.16b,v31.16b
	cmp	x3,#8*(14+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v15.16b,v15.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v16.16b,v16.16b,v31.16b
	cmp	x3,#8*(16+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v17.16b,v17.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v18.16b,v18.16b,v31.16b
	cmp	x3,#8*(18+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v19.16b,v19.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v20.16b,v20.16b,v31.16b
	cmp	x3,#8*(20+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v21.16b,v21.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v22.16b,v22.16b,v31.16b
	cmp	x3,#8*(22+2)
	blo	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v23.16b,v23.16b,v31.16b
	beq	.Lprocess_block_ce
	ldr	d31,[x1],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	v24.16b,v24.16b,v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce		// x30 is restored in the epilogue

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
	stp	d0,d1,[x0,#8*0]
	stp	d2,d3,[x0,#8*2]
	stp	d4,d5,[x0,#8*4]
	stp	d6,d7,[x0,#8*6]
	stp	d8,d9,[x0,#8*8]
	stp	d10,d11,[x0,#8*10]
	stp	d12,d13,[x0,#8*12]
	stp	d14,d15,[x0,#8*14]
	stp	d16,d17,[x0,#8*16]
	stp	d18,d19,[x0,#8*18]
	stp	d20,d21,[x0,#8*20]
	stp	d22,d23,[x0,#8*22]
	str	d24,[x0,#8*24]
	add	x0,x2,x3		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext - copy output bytes out of the state, permuting with
// KeccakF1600_cext between blocks.
//
// In:   x0 = state address, x1 = output buffer (advanced as bytes are
//       stored), x2 = number of bytes to produce, x3 = block size in
//       bytes.
// Regs: x9 = read cursor inside the state, x10 = bytes left in the
//       current block, x4 = lane being copied.
// x0 and x3 are still valid after the call (x9/x10 are reloaded from
// them); x30 is reloaded from the frame after each call, so the epilogue
// only pops x29.
// The "beq" after the 8-byte store uses the flags of "cmp x2,#8", i.e. it
// leaves when exactly 8 bytes were still wanted.
// NOTE(review): this routine never reads w4 as an argument - x4 is
// overwritten by the first load - so a "next" flag like the one taken by
// SHA3_squeeze is not honoured here; confirm that callers do not rely
// on it.
// NOTE(review): with x2 == 0 on entry the tail is entered and stores 7
// bytes, because the first strb precedes the first length check and the
// subs wraps below zero. Presumably callers never pass a zero length -
// confirm.
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,x0
	mov	x10,x3

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	x2,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[x1],#8
	beq	.Lsqueeze_done_ce

	sub	x2,x2,#8
	subs	x10,x10,#8		// bytes left in this block
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]		// return address overwritten by the bl
	mov	x9,x0			// restart at the first lane
	mov	x10,x3			// full block available again
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1
	lsr	x4,x4,#8
	subs	x2,x2,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[x1],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string placed after the code:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2