Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/aesv8-armx.S
39536 views
1
/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
2
#include "arm_arch.h"
3
4
#if __ARM_MAX_ARCH__>=7
5
.arch armv8-a+crypto
6
.text
7
.section .rodata
8
.align 5
9
.Lrcon:
10
.long 0x01,0x01,0x01,0x01
11
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
12
.long 0x1b,0x1b,0x1b,0x1b
13
.previous
14
// int aes_v8_set_encrypt_key(const unsigned char *userKey /* x0 */,
//                            const int bits              /* w1 */,
//                            AES_KEY *key                /* x2 */);
// Expands an AES key into the encryption round-key schedule using the
// ARMv8 Crypto Extensions (aese used here purely for its SubBytes step).
// Returns (in x0 via .Lenc_key_abort): 0 on success, -1 if userKey or key
// is NULL, -2 if bits is out of range or not a multiple of 64.
// Auto-generated from aesv8-armx.pl — do not hand-edit the instruction flow.
.globl aes_v8_set_encrypt_key
15
.type aes_v8_set_encrypt_key,%function
16
.align 5
17
aes_v8_set_encrypt_key:
18
.Lenc_key:
19
	AARCH64_VALID_CALL_TARGET
20
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
21
	stp	x29,x30,[sp,#-16]!
22
	add	x29,sp,#0
23
	mov	x3,#-1	// pending return value: -1 for NULL arguments
24
	cmp	x0,#0
25
	b.eq	.Lenc_key_abort
26
	cmp	x2,#0
27
	b.eq	.Lenc_key_abort
28
	mov	x3,#-2	// pending return value: -2 for bad key length
29
	cmp	w1,#128
30
	b.lt	.Lenc_key_abort
31
	cmp	w1,#256
32
	b.gt	.Lenc_key_abort
33
	tst	w1,#0x3f
34
	b.ne	.Lenc_key_abort
35
36
	adrp	x3,.Lrcon	// x3 = &rcon table (round constants + rotate mask)
37
	add	x3,x3,#:lo12:.Lrcon
38
	cmp	w1,#192
39
40
	eor	v0.16b,v0.16b,v0.16b	// v0 = 0 (zero key for aese => plain SubBytes)
41
	ld1	{v3.16b},[x0],#16
42
	mov	w1,#8		// reuse w1
43
	ld1	{v1.4s,v2.4s},[x3],#32
44
45
	b.lt	.Loop128	// bits==128
46
	b.eq	.L192		// bits==192
47
	b	.L256		// bits==256
48
49
.align	4
50
.Loop128:
51
	tbl	v6.16b,{v3.16b},v2.16b
52
	ext	v5.16b,v0.16b,v3.16b,#12
53
	st1	{v3.4s},[x2],#16
54
	aese	v6.16b,v0.16b
55
	subs	w1,w1,#1
56
57
	eor	v3.16b,v3.16b,v5.16b
58
	ext	v5.16b,v0.16b,v5.16b,#12
59
	eor	v3.16b,v3.16b,v5.16b
60
	ext	v5.16b,v0.16b,v5.16b,#12
61
	eor	v6.16b,v6.16b,v1.16b
62
	eor	v3.16b,v3.16b,v5.16b
63
	shl	v1.16b,v1.16b,#1	// advance round constant
64
	eor	v3.16b,v3.16b,v6.16b
65
	b.ne	.Loop128
66
67
	ld1	{v1.4s},[x3]	// final rcon value (0x1b splat)
68
69
	tbl	v6.16b,{v3.16b},v2.16b
70
	ext	v5.16b,v0.16b,v3.16b,#12
71
	st1	{v3.4s},[x2],#16
72
	aese	v6.16b,v0.16b
73
74
	eor	v3.16b,v3.16b,v5.16b
75
	ext	v5.16b,v0.16b,v5.16b,#12
76
	eor	v3.16b,v3.16b,v5.16b
77
	ext	v5.16b,v0.16b,v5.16b,#12
78
	eor	v6.16b,v6.16b,v1.16b
79
	eor	v3.16b,v3.16b,v5.16b
80
	shl	v1.16b,v1.16b,#1
81
	eor	v3.16b,v3.16b,v6.16b
82
83
	tbl	v6.16b,{v3.16b},v2.16b
84
	ext	v5.16b,v0.16b,v3.16b,#12
85
	st1	{v3.4s},[x2],#16
86
	aese	v6.16b,v0.16b
87
88
	eor	v3.16b,v3.16b,v5.16b
89
	ext	v5.16b,v0.16b,v5.16b,#12
90
	eor	v3.16b,v3.16b,v5.16b
91
	ext	v5.16b,v0.16b,v5.16b,#12
92
	eor	v6.16b,v6.16b,v1.16b
93
	eor	v3.16b,v3.16b,v5.16b
94
	eor	v3.16b,v3.16b,v6.16b
95
	st1	{v3.4s},[x2]
96
	add	x2,x2,#0x50
97
98
	mov	w12,#10		// AES-128: 10 rounds
99
	b	.Ldone
100
101
.align	4
102
.L192:
103
	ld1	{v4.8b},[x0],#8
104
	movi	v6.16b,#8	// borrow v6.16b
105
	st1	{v3.4s},[x2],#16
106
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
107
108
.Loop192:
109
	tbl	v6.16b,{v4.16b},v2.16b
110
	ext	v5.16b,v0.16b,v3.16b,#12
111
#ifdef __AARCH64EB__
112
	st1	{v4.4s},[x2],#16
113
	sub	x2,x2,#8
114
#else
115
	st1	{v4.8b},[x2],#8
116
#endif
117
	aese	v6.16b,v0.16b
118
	subs	w1,w1,#1
119
120
	eor	v3.16b,v3.16b,v5.16b
121
	ext	v5.16b,v0.16b,v5.16b,#12
122
	eor	v3.16b,v3.16b,v5.16b
123
	ext	v5.16b,v0.16b,v5.16b,#12
124
	eor	v3.16b,v3.16b,v5.16b
125
126
	dup	v5.4s,v3.s[3]
127
	eor	v5.16b,v5.16b,v4.16b
128
	eor	v6.16b,v6.16b,v1.16b
129
	ext	v4.16b,v0.16b,v4.16b,#12
130
	shl	v1.16b,v1.16b,#1
131
	eor	v4.16b,v4.16b,v5.16b
132
	eor	v3.16b,v3.16b,v6.16b
133
	eor	v4.16b,v4.16b,v6.16b
134
	st1	{v3.4s},[x2],#16
135
	b.ne	.Loop192
136
137
	mov	w12,#12		// AES-192: 12 rounds
138
	add	x2,x2,#0x20
139
	b	.Ldone
140
141
.align	4
142
.L256:
143
	ld1	{v4.16b},[x0]
144
	mov	w1,#7
145
	mov	w12,#14		// AES-256: 14 rounds
146
	st1	{v3.4s},[x2],#16
147
148
.Loop256:
149
	tbl	v6.16b,{v4.16b},v2.16b
150
	ext	v5.16b,v0.16b,v3.16b,#12
151
	st1	{v4.4s},[x2],#16
152
	aese	v6.16b,v0.16b
153
	subs	w1,w1,#1
154
155
	eor	v3.16b,v3.16b,v5.16b
156
	ext	v5.16b,v0.16b,v5.16b,#12
157
	eor	v3.16b,v3.16b,v5.16b
158
	ext	v5.16b,v0.16b,v5.16b,#12
159
	eor	v6.16b,v6.16b,v1.16b
160
	eor	v3.16b,v3.16b,v5.16b
161
	shl	v1.16b,v1.16b,#1
162
	eor	v3.16b,v3.16b,v6.16b
163
	st1	{v3.4s},[x2],#16
164
	b.eq	.Ldone
165
166
	dup	v6.4s,v3.s[3]		// just splat
167
	ext	v5.16b,v0.16b,v4.16b,#12
168
	aese	v6.16b,v0.16b
169
170
	eor	v4.16b,v4.16b,v5.16b
171
	ext	v5.16b,v0.16b,v5.16b,#12
172
	eor	v4.16b,v4.16b,v5.16b
173
	ext	v5.16b,v0.16b,v5.16b,#12
174
	eor	v4.16b,v4.16b,v5.16b
175
176
	eor	v4.16b,v4.16b,v6.16b
177
	b	.Loop256
178
179
.Ldone:
180
	str	w12,[x2]	// store rounds count at key->rounds (offset 240)
181
	mov	x3,#0		// success
182
183
.Lenc_key_abort:
184
	mov	x0,x3			// return value
185
	ldr	x29,[sp],#16
186
	ret
187
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
188
189
// int aes_v8_set_decrypt_key(const unsigned char *userKey /* x0 */,
//                            const int bits              /* w1 */,
//                            AES_KEY *key                /* x2 */);
// Builds the decryption key schedule: first expands the encryption schedule
// via .Lenc_key, then reverses the order of the round keys in place and
// applies InvMixColumns (aesimc) to all but the first and last round keys,
// as required by the equivalent-inverse-cipher form used by aesd/aesimc.
// Returns 0 on success, or the negative error code from .Lenc_key.
.globl aes_v8_set_decrypt_key
190
.type aes_v8_set_decrypt_key,%function
191
.align 5
192
aes_v8_set_decrypt_key:
193
	AARCH64_SIGN_LINK_REGISTER
194
	stp	x29,x30,[sp,#-16]!
195
	add	x29,sp,#0
196
	bl	.Lenc_key	// expand encryption schedule first
197
198
	cmp	x0,#0
199
	b.ne	.Ldec_key_abort
200
201
	sub	x2,x2,#240		// restore original x2
202
	mov	x4,#-16			// backward stride for the top-of-schedule pointer
203
	add	x0,x2,x12,lsl#4	// end of key schedule
204
205
	ld1	{v0.4s},[x2]	// swap first and last round keys (no aesimc on these)
206
	ld1	{v1.4s},[x0]
207
	st1	{v0.4s},[x0],x4
208
	st1	{v1.4s},[x2],#16
209
210
.Loop_imc:
211
	ld1	{v0.4s},[x2]	// swap+InvMixColumns the remaining pairs, ends meeting in middle
212
	ld1	{v1.4s},[x0]
213
	aesimc	v0.16b,v0.16b
214
	aesimc	v1.16b,v1.16b
215
	st1	{v0.4s},[x0],x4
216
	st1	{v1.4s},[x2],#16
217
	cmp	x0,x2
218
	b.hi	.Loop_imc
219
220
	ld1	{v0.4s},[x2]	// middle round key: aesimc in place
221
	aesimc	v0.16b,v0.16b
222
	st1	{v0.4s},[x0]
223
224
	eor	x0,x0,x0		// return value
225
.Ldec_key_abort:
226
	ldp	x29,x30,[sp],#16
227
	AARCH64_VALIDATE_LINK_REGISTER
228
	ret
229
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
230
// void aes_v8_encrypt(const unsigned char *in /* x0 */,
//                     unsigned char *out      /* x1 */,
//                     const AES_KEY *key      /* x2 */);
// Encrypts a single 16-byte block with the ARMv8 aese/aesmc instructions.
// Rounds count is read from key->rounds at offset 240; the loop consumes
// two round keys per iteration, with the final two rounds peeled (the last
// round has no MixColumns, so it is aese followed by an eor with the
// final round key).
.globl aes_v8_encrypt
231
.type aes_v8_encrypt,%function
232
.align 5
233
aes_v8_encrypt:
234
	AARCH64_VALID_CALL_TARGET
235
	ldr	w3,[x2,#240]	// w3 = rounds
236
	ld1	{v0.4s},[x2],#16
237
	ld1	{v2.16b},[x0]	// v2 = plaintext block
238
	sub	w3,w3,#2	// loop counter biased by the two peeled rounds
239
	ld1	{v1.4s},[x2],#16
240
241
.Loop_enc:
242
	aese	v2.16b,v0.16b
243
	aesmc	v2.16b,v2.16b
244
	ld1	{v0.4s},[x2],#16
245
	subs	w3,w3,#2
246
	aese	v2.16b,v1.16b
247
	aesmc	v2.16b,v2.16b
248
	ld1	{v1.4s},[x2],#16
249
	b.gt	.Loop_enc
250
251
	aese	v2.16b,v0.16b	// penultimate round
252
	aesmc	v2.16b,v2.16b
253
	ld1	{v0.4s},[x2]
254
	aese	v2.16b,v1.16b	// last round: no MixColumns
255
	eor	v2.16b,v2.16b,v0.16b	// xor final round key
256
257
	st1	{v2.16b},[x1]
258
	ret
259
.size	aes_v8_encrypt,.-aes_v8_encrypt
260
// void aes_v8_decrypt(const unsigned char *in /* x0 */,
//                     unsigned char *out      /* x1 */,
//                     const AES_KEY *key      /* x2 */);
// Decrypts a single 16-byte block; exact mirror of aes_v8_encrypt using
// aesd/aesimc and a schedule prepared by aes_v8_set_decrypt_key (round
// keys already reversed and InvMixColumns'd). Two rounds per loop
// iteration, final two rounds peeled.
.globl aes_v8_decrypt
261
.type aes_v8_decrypt,%function
262
.align 5
263
aes_v8_decrypt:
264
	AARCH64_VALID_CALL_TARGET
265
	ldr	w3,[x2,#240]	// w3 = rounds
266
	ld1	{v0.4s},[x2],#16
267
	ld1	{v2.16b},[x0]	// v2 = ciphertext block
268
	sub	w3,w3,#2	// loop counter biased by the two peeled rounds
269
	ld1	{v1.4s},[x2],#16
270
271
.Loop_dec:
272
	aesd	v2.16b,v0.16b
273
	aesimc	v2.16b,v2.16b
274
	ld1	{v0.4s},[x2],#16
275
	subs	w3,w3,#2
276
	aesd	v2.16b,v1.16b
277
	aesimc	v2.16b,v2.16b
278
	ld1	{v1.4s},[x2],#16
279
	b.gt	.Loop_dec
280
281
	aesd	v2.16b,v0.16b	// penultimate round
282
	aesimc	v2.16b,v2.16b
283
	ld1	{v0.4s},[x2]
284
	aesd	v2.16b,v1.16b	// last round: no InvMixColumns
285
	eor	v2.16b,v2.16b,v0.16b	// xor final round key
286
287
	st1	{v2.16b},[x1]
288
	ret
289
.size	aes_v8_decrypt,.-aes_v8_decrypt
290
// void aes_v8_ecb_encrypt(const unsigned char *in /* x0 */,
//                         unsigned char *out      /* x1 */,
//                         size_t length           /* x2 */,
//                         const AES_KEY *key      /* x3 */,
//                         const int enc           /* w4: nonzero=encrypt */);
// ECB-mode bulk en/decryption. A fast leaf path handles length == 16
// exactly; otherwise the bulk path processes 5 blocks per loop iteration
// (then 3, then 1-2 in the tail) to hide aese/aesmc (or aesd/aesimc)
// latency. x7 walks the key schedule; v16/v17 hold the two "rolling" round
// keys, v18-v23 and v7 cache the last 7 round keys.
// NOTE(review): auto-generated from aesv8-armx.pl — instruction order is
// latency-tuned; do not hand-edit the logic.
.globl aes_v8_ecb_encrypt
291
.type aes_v8_ecb_encrypt,%function
292
.align 5
293
aes_v8_ecb_encrypt:
294
	AARCH64_VALID_CALL_TARGET
295
	subs	x2,x2,#16
296
	// Original input data size bigger than 16, jump to big size processing.
297
	b.ne	.Lecb_big_size
298
	ld1	{v0.16b},[x0]
299
	cmp	w4,#0			// en- or decrypting?
300
	ldr	w5,[x3,#240]
301
	ld1	{v5.4s,v6.4s},[x3],#32		// load key schedule...
302
303
	b.eq	.Lecb_small_dec
304
	aese	v0.16b,v5.16b
305
	aesmc	v0.16b,v0.16b
306
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
307
	aese	v0.16b,v6.16b
308
	aesmc	v0.16b,v0.16b
309
	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
310
	b.eq	.Lecb_128_enc
311
.Lecb_round_loop:
312
	aese	v0.16b,v16.16b
313
	aesmc	v0.16b,v0.16b
314
	ld1	{v16.4s},[x3],#16		// load key schedule...
315
	aese	v0.16b,v17.16b
316
	aesmc	v0.16b,v0.16b
317
	ld1	{v17.4s},[x3],#16		// load key schedule...
318
	subs	w5,w5,#2			// bias
319
	b.gt	.Lecb_round_loop
320
.Lecb_128_enc:
321
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
322
	aese	v0.16b,v16.16b
323
	aesmc	v0.16b,v0.16b
324
	aese	v0.16b,v17.16b
325
	aesmc	v0.16b,v0.16b
326
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
327
	aese	v0.16b,v18.16b
328
	aesmc	v0.16b,v0.16b
329
	aese	v0.16b,v19.16b
330
	aesmc	v0.16b,v0.16b
331
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
332
	aese	v0.16b,v20.16b
333
	aesmc	v0.16b,v0.16b
334
	aese	v0.16b,v21.16b
335
	aesmc	v0.16b,v0.16b
336
	ld1	{v7.4s},[x3]
337
	aese	v0.16b,v22.16b
338
	aesmc	v0.16b,v0.16b
339
	aese	v0.16b,v23.16b
340
	eor	v0.16b,v0.16b,v7.16b
341
	st1	{v0.16b},[x1]
342
	b	.Lecb_Final_abort
343
.Lecb_small_dec:
344
	aesd	v0.16b,v5.16b
345
	aesimc	v0.16b,v0.16b
346
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
347
	aesd	v0.16b,v6.16b
348
	aesimc	v0.16b,v0.16b
349
	subs	w5,w5,#10			// bias
350
	b.eq	.Lecb_128_dec
351
.Lecb_dec_round_loop:
352
	aesd	v0.16b,v16.16b
353
	aesimc	v0.16b,v0.16b
354
	ld1	{v16.4s},[x3],#16		// load key schedule...
355
	aesd	v0.16b,v17.16b
356
	aesimc	v0.16b,v0.16b
357
	ld1	{v17.4s},[x3],#16		// load key schedule...
358
	subs	w5,w5,#2			// bias
359
	b.gt	.Lecb_dec_round_loop
360
.Lecb_128_dec:
361
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
362
	aesd	v0.16b,v16.16b
363
	aesimc	v0.16b,v0.16b
364
	aesd	v0.16b,v17.16b
365
	aesimc	v0.16b,v0.16b
366
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
367
	aesd	v0.16b,v18.16b
368
	aesimc	v0.16b,v0.16b
369
	aesd	v0.16b,v19.16b
370
	aesimc	v0.16b,v0.16b
371
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
372
	aesd	v0.16b,v20.16b
373
	aesimc	v0.16b,v0.16b
374
	aesd	v0.16b,v21.16b
375
	aesimc	v0.16b,v0.16b
376
	ld1	{v7.4s},[x3]
377
	aesd	v0.16b,v22.16b
378
	aesimc	v0.16b,v0.16b
379
	aesd	v0.16b,v23.16b
380
	eor	v0.16b,v0.16b,v7.16b
381
	st1	{v0.16b},[x1]
382
	b	.Lecb_Final_abort
383
.Lecb_big_size:	// length != 16: multi-block bulk path
384
	stp	x29,x30,[sp,#-16]!
385
	add	x29,sp,#0
386
	mov	x8,#16
387
	b.lo	.Lecb_done
388
	csel	x8,xzr,x8,eq
389
390
	cmp	w4,#0			// en- or decrypting?
391
	ldr	w5,[x3,#240]
392
	and	x2,x2,#-16
393
	ld1	{v0.16b},[x0],x8
394
395
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
396
	sub	w5,w5,#6
397
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
398
	sub	w5,w5,#2
399
	ld1	{v18.4s,v19.4s},[x7],#32
400
	ld1	{v20.4s,v21.4s},[x7],#32
401
	ld1	{v22.4s,v23.4s},[x7],#32
402
	ld1	{v7.4s},[x7]
403
404
	add	x7,x3,#32
405
	mov	w6,w5
406
	b.eq	.Lecb_dec
407
408
	ld1	{v1.16b},[x0],#16
409
	subs	x2,x2,#32			// bias
410
	add	w6,w5,#2
411
	orr	v3.16b,v1.16b,v1.16b
412
	orr	v24.16b,v1.16b,v1.16b
413
	orr	v1.16b,v0.16b,v0.16b
414
	b.lo	.Lecb_enc_tail
415
416
	orr	v1.16b,v3.16b,v3.16b
417
	ld1	{v24.16b},[x0],#16
418
	cmp	x2,#32
419
	b.lo	.Loop3x_ecb_enc
420
421
	ld1	{v25.16b},[x0],#16
422
	ld1	{v26.16b},[x0],#16
423
	sub	x2,x2,#32			// bias
424
	mov	w6,w5
425
426
.Loop5x_ecb_enc:	// 5-way interleaved encrypt rounds
427
	aese	v0.16b,v16.16b
428
	aesmc	v0.16b,v0.16b
429
	aese	v1.16b,v16.16b
430
	aesmc	v1.16b,v1.16b
431
	aese	v24.16b,v16.16b
432
	aesmc	v24.16b,v24.16b
433
	aese	v25.16b,v16.16b
434
	aesmc	v25.16b,v25.16b
435
	aese	v26.16b,v16.16b
436
	aesmc	v26.16b,v26.16b
437
	ld1	{v16.4s},[x7],#16
438
	subs	w6,w6,#2
439
	aese	v0.16b,v17.16b
440
	aesmc	v0.16b,v0.16b
441
	aese	v1.16b,v17.16b
442
	aesmc	v1.16b,v1.16b
443
	aese	v24.16b,v17.16b
444
	aesmc	v24.16b,v24.16b
445
	aese	v25.16b,v17.16b
446
	aesmc	v25.16b,v25.16b
447
	aese	v26.16b,v17.16b
448
	aesmc	v26.16b,v26.16b
449
	ld1	{v17.4s},[x7],#16
450
	b.gt	.Loop5x_ecb_enc
451
452
	aese	v0.16b,v16.16b
453
	aesmc	v0.16b,v0.16b
454
	aese	v1.16b,v16.16b
455
	aesmc	v1.16b,v1.16b
456
	aese	v24.16b,v16.16b
457
	aesmc	v24.16b,v24.16b
458
	aese	v25.16b,v16.16b
459
	aesmc	v25.16b,v25.16b
460
	aese	v26.16b,v16.16b
461
	aesmc	v26.16b,v26.16b
462
	cmp	x2,#0x40		// because .Lecb_enc_tail4x
463
	sub	x2,x2,#0x50
464
465
	aese	v0.16b,v17.16b
466
	aesmc	v0.16b,v0.16b
467
	aese	v1.16b,v17.16b
468
	aesmc	v1.16b,v1.16b
469
	aese	v24.16b,v17.16b
470
	aesmc	v24.16b,v24.16b
471
	aese	v25.16b,v17.16b
472
	aesmc	v25.16b,v25.16b
473
	aese	v26.16b,v17.16b
474
	aesmc	v26.16b,v26.16b
475
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
476
	mov	x7,x3
477
478
	aese	v0.16b,v18.16b
479
	aesmc	v0.16b,v0.16b
480
	aese	v1.16b,v18.16b
481
	aesmc	v1.16b,v1.16b
482
	aese	v24.16b,v18.16b
483
	aesmc	v24.16b,v24.16b
484
	aese	v25.16b,v18.16b
485
	aesmc	v25.16b,v25.16b
486
	aese	v26.16b,v18.16b
487
	aesmc	v26.16b,v26.16b
488
	add	x0,x0,x6		// x0 is adjusted in such way that
489
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
490
	add	x6,x2,#0x60		// because .Lecb_enc_tail4x
491
492
	aese	v0.16b,v19.16b
493
	aesmc	v0.16b,v0.16b
494
	aese	v1.16b,v19.16b
495
	aesmc	v1.16b,v1.16b
496
	aese	v24.16b,v19.16b
497
	aesmc	v24.16b,v24.16b
498
	aese	v25.16b,v19.16b
499
	aesmc	v25.16b,v25.16b
500
	aese	v26.16b,v19.16b
501
	aesmc	v26.16b,v26.16b
502
503
	aese	v0.16b,v20.16b
504
	aesmc	v0.16b,v0.16b
505
	aese	v1.16b,v20.16b
506
	aesmc	v1.16b,v1.16b
507
	aese	v24.16b,v20.16b
508
	aesmc	v24.16b,v24.16b
509
	aese	v25.16b,v20.16b
510
	aesmc	v25.16b,v25.16b
511
	aese	v26.16b,v20.16b
512
	aesmc	v26.16b,v26.16b
513
514
	aese	v0.16b,v21.16b
515
	aesmc	v0.16b,v0.16b
516
	aese	v1.16b,v21.16b
517
	aesmc	v1.16b,v1.16b
518
	aese	v24.16b,v21.16b
519
	aesmc	v24.16b,v24.16b
520
	aese	v25.16b,v21.16b
521
	aesmc	v25.16b,v25.16b
522
	aese	v26.16b,v21.16b
523
	aesmc	v26.16b,v26.16b
524
525
	aese	v0.16b,v22.16b
526
	aesmc	v0.16b,v0.16b
527
	aese	v1.16b,v22.16b
528
	aesmc	v1.16b,v1.16b
529
	aese	v24.16b,v22.16b
530
	aesmc	v24.16b,v24.16b
531
	aese	v25.16b,v22.16b
532
	aesmc	v25.16b,v25.16b
533
	aese	v26.16b,v22.16b
534
	aesmc	v26.16b,v26.16b
535
536
	aese	v0.16b,v23.16b
537
	ld1	{v2.16b},[x0],#16
538
	aese	v1.16b,v23.16b
539
	ld1	{v3.16b},[x0],#16
540
	aese	v24.16b,v23.16b
541
	ld1	{v27.16b},[x0],#16
542
	aese	v25.16b,v23.16b
543
	ld1	{v28.16b},[x0],#16
544
	aese	v26.16b,v23.16b
545
	ld1	{v29.16b},[x0],#16
546
	cbz	x6,.Lecb_enc_tail4x
547
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
548
	eor	v4.16b,v7.16b,v0.16b
549
	orr	v0.16b,v2.16b,v2.16b
550
	eor	v5.16b,v7.16b,v1.16b
551
	orr	v1.16b,v3.16b,v3.16b
552
	eor	v17.16b,v7.16b,v24.16b
553
	orr	v24.16b,v27.16b,v27.16b
554
	eor	v30.16b,v7.16b,v25.16b
555
	orr	v25.16b,v28.16b,v28.16b
556
	eor	v31.16b,v7.16b,v26.16b
557
	st1	{v4.16b},[x1],#16
558
	orr	v26.16b,v29.16b,v29.16b
559
	st1	{v5.16b},[x1],#16
560
	mov	w6,w5
561
	st1	{v17.16b},[x1],#16
562
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
563
	st1	{v30.16b},[x1],#16
564
	st1	{v31.16b},[x1],#16
565
	b.hs	.Loop5x_ecb_enc
566
567
	add	x2,x2,#0x50
568
	cbz	x2,.Lecb_done
569
570
	add	w6,w5,#2
571
	subs	x2,x2,#0x30
572
	orr	v0.16b,v27.16b,v27.16b
573
	orr	v1.16b,v28.16b,v28.16b
574
	orr	v24.16b,v29.16b,v29.16b
575
	b.lo	.Lecb_enc_tail
576
577
	b	.Loop3x_ecb_enc
578
579
.align	4
580
.Lecb_enc_tail4x:	// flush last 4 in-flight blocks
581
	eor	v5.16b,v7.16b,v1.16b
582
	eor	v17.16b,v7.16b,v24.16b
583
	eor	v30.16b,v7.16b,v25.16b
584
	eor	v31.16b,v7.16b,v26.16b
585
	st1	{v5.16b},[x1],#16
586
	st1	{v17.16b},[x1],#16
587
	st1	{v30.16b},[x1],#16
588
	st1	{v31.16b},[x1],#16
589
590
	b	.Lecb_done
591
.align	4
592
.Loop3x_ecb_enc:	// 3-way interleaved encrypt rounds
593
	aese	v0.16b,v16.16b
594
	aesmc	v0.16b,v0.16b
595
	aese	v1.16b,v16.16b
596
	aesmc	v1.16b,v1.16b
597
	aese	v24.16b,v16.16b
598
	aesmc	v24.16b,v24.16b
599
	ld1	{v16.4s},[x7],#16
600
	subs	w6,w6,#2
601
	aese	v0.16b,v17.16b
602
	aesmc	v0.16b,v0.16b
603
	aese	v1.16b,v17.16b
604
	aesmc	v1.16b,v1.16b
605
	aese	v24.16b,v17.16b
606
	aesmc	v24.16b,v24.16b
607
	ld1	{v17.4s},[x7],#16
608
	b.gt	.Loop3x_ecb_enc
609
610
	aese	v0.16b,v16.16b
611
	aesmc	v0.16b,v0.16b
612
	aese	v1.16b,v16.16b
613
	aesmc	v1.16b,v1.16b
614
	aese	v24.16b,v16.16b
615
	aesmc	v24.16b,v24.16b
616
	subs	x2,x2,#0x30
617
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
618
	aese	v0.16b,v17.16b
619
	aesmc	v0.16b,v0.16b
620
	aese	v1.16b,v17.16b
621
	aesmc	v1.16b,v1.16b
622
	aese	v24.16b,v17.16b
623
	aesmc	v24.16b,v24.16b
624
	add	x0,x0,x6		// x0 is adjusted in such way that
625
					// at exit from the loop v1.16b-v24.16b
					// are loaded with last "words"
626
	mov	x7,x3
627
	aese	v0.16b,v20.16b
628
	aesmc	v0.16b,v0.16b
629
	aese	v1.16b,v20.16b
630
	aesmc	v1.16b,v1.16b
631
	aese	v24.16b,v20.16b
632
	aesmc	v24.16b,v24.16b
633
	ld1	{v2.16b},[x0],#16
634
	aese	v0.16b,v21.16b
635
	aesmc	v0.16b,v0.16b
636
	aese	v1.16b,v21.16b
637
	aesmc	v1.16b,v1.16b
638
	aese	v24.16b,v21.16b
639
	aesmc	v24.16b,v24.16b
640
	ld1	{v3.16b},[x0],#16
641
	aese	v0.16b,v22.16b
642
	aesmc	v0.16b,v0.16b
643
	aese	v1.16b,v22.16b
644
	aesmc	v1.16b,v1.16b
645
	aese	v24.16b,v22.16b
646
	aesmc	v24.16b,v24.16b
647
	ld1	{v27.16b},[x0],#16
648
	aese	v0.16b,v23.16b
649
	aese	v1.16b,v23.16b
650
	aese	v24.16b,v23.16b
651
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
652
	add	w6,w5,#2
653
	eor	v4.16b,v7.16b,v0.16b
654
	eor	v5.16b,v7.16b,v1.16b
655
	eor	v24.16b,v24.16b,v7.16b
656
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
657
	st1	{v4.16b},[x1],#16
658
	orr	v0.16b,v2.16b,v2.16b
659
	st1	{v5.16b},[x1],#16
660
	orr	v1.16b,v3.16b,v3.16b
661
	st1	{v24.16b},[x1],#16
662
	orr	v24.16b,v27.16b,v27.16b
663
	b.hs	.Loop3x_ecb_enc
664
665
	cmn	x2,#0x30
666
	b.eq	.Lecb_done
667
	nop
668
669
.Lecb_enc_tail:	// last 1-2 blocks (held in v1/v24)
670
	aese	v1.16b,v16.16b
671
	aesmc	v1.16b,v1.16b
672
	aese	v24.16b,v16.16b
673
	aesmc	v24.16b,v24.16b
674
	ld1	{v16.4s},[x7],#16
675
	subs	w6,w6,#2
676
	aese	v1.16b,v17.16b
677
	aesmc	v1.16b,v1.16b
678
	aese	v24.16b,v17.16b
679
	aesmc	v24.16b,v24.16b
680
	ld1	{v17.4s},[x7],#16
681
	b.gt	.Lecb_enc_tail
682
683
	aese	v1.16b,v16.16b
684
	aesmc	v1.16b,v1.16b
685
	aese	v24.16b,v16.16b
686
	aesmc	v24.16b,v24.16b
687
	aese	v1.16b,v17.16b
688
	aesmc	v1.16b,v1.16b
689
	aese	v24.16b,v17.16b
690
	aesmc	v24.16b,v24.16b
691
	aese	v1.16b,v20.16b
692
	aesmc	v1.16b,v1.16b
693
	aese	v24.16b,v20.16b
694
	aesmc	v24.16b,v24.16b
695
	cmn	x2,#0x20
696
	aese	v1.16b,v21.16b
697
	aesmc	v1.16b,v1.16b
698
	aese	v24.16b,v21.16b
699
	aesmc	v24.16b,v24.16b
700
	aese	v1.16b,v22.16b
701
	aesmc	v1.16b,v1.16b
702
	aese	v24.16b,v22.16b
703
	aesmc	v24.16b,v24.16b
704
	aese	v1.16b,v23.16b
705
	aese	v24.16b,v23.16b
706
	b.eq	.Lecb_enc_one
707
	eor	v5.16b,v7.16b,v1.16b
708
	eor	v17.16b,v7.16b,v24.16b
709
	st1	{v5.16b},[x1],#16
710
	st1	{v17.16b},[x1],#16
711
	b	.Lecb_done
712
713
.Lecb_enc_one:
714
	eor	v5.16b,v7.16b,v24.16b
715
	st1	{v5.16b},[x1],#16
716
	b	.Lecb_done
717
.align	5
718
.Lecb_dec:	// bulk decrypt path, mirrors the encrypt path with aesd/aesimc
719
	ld1	{v1.16b},[x0],#16
720
	subs	x2,x2,#32			// bias
721
	add	w6,w5,#2
722
	orr	v3.16b,v1.16b,v1.16b
723
	orr	v24.16b,v1.16b,v1.16b
724
	orr	v1.16b,v0.16b,v0.16b
725
	b.lo	.Lecb_dec_tail
726
727
	orr	v1.16b,v3.16b,v3.16b
728
	ld1	{v24.16b},[x0],#16
729
	cmp	x2,#32
730
	b.lo	.Loop3x_ecb_dec
731
732
	ld1	{v25.16b},[x0],#16
733
	ld1	{v26.16b},[x0],#16
734
	sub	x2,x2,#32			// bias
735
	mov	w6,w5
736
737
.Loop5x_ecb_dec:	// 5-way interleaved decrypt rounds
738
	aesd	v0.16b,v16.16b
739
	aesimc	v0.16b,v0.16b
740
	aesd	v1.16b,v16.16b
741
	aesimc	v1.16b,v1.16b
742
	aesd	v24.16b,v16.16b
743
	aesimc	v24.16b,v24.16b
744
	aesd	v25.16b,v16.16b
745
	aesimc	v25.16b,v25.16b
746
	aesd	v26.16b,v16.16b
747
	aesimc	v26.16b,v26.16b
748
	ld1	{v16.4s},[x7],#16
749
	subs	w6,w6,#2
750
	aesd	v0.16b,v17.16b
751
	aesimc	v0.16b,v0.16b
752
	aesd	v1.16b,v17.16b
753
	aesimc	v1.16b,v1.16b
754
	aesd	v24.16b,v17.16b
755
	aesimc	v24.16b,v24.16b
756
	aesd	v25.16b,v17.16b
757
	aesimc	v25.16b,v25.16b
758
	aesd	v26.16b,v17.16b
759
	aesimc	v26.16b,v26.16b
760
	ld1	{v17.4s},[x7],#16
761
	b.gt	.Loop5x_ecb_dec
762
763
	aesd	v0.16b,v16.16b
764
	aesimc	v0.16b,v0.16b
765
	aesd	v1.16b,v16.16b
766
	aesimc	v1.16b,v1.16b
767
	aesd	v24.16b,v16.16b
768
	aesimc	v24.16b,v24.16b
769
	aesd	v25.16b,v16.16b
770
	aesimc	v25.16b,v25.16b
771
	aesd	v26.16b,v16.16b
772
	aesimc	v26.16b,v26.16b
773
	cmp	x2,#0x40		// because .Lecb_tail4x
774
	sub	x2,x2,#0x50
775
776
	aesd	v0.16b,v17.16b
777
	aesimc	v0.16b,v0.16b
778
	aesd	v1.16b,v17.16b
779
	aesimc	v1.16b,v1.16b
780
	aesd	v24.16b,v17.16b
781
	aesimc	v24.16b,v24.16b
782
	aesd	v25.16b,v17.16b
783
	aesimc	v25.16b,v25.16b
784
	aesd	v26.16b,v17.16b
785
	aesimc	v26.16b,v26.16b
786
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
787
	mov	x7,x3
788
789
	aesd	v0.16b,v18.16b
790
	aesimc	v0.16b,v0.16b
791
	aesd	v1.16b,v18.16b
792
	aesimc	v1.16b,v1.16b
793
	aesd	v24.16b,v18.16b
794
	aesimc	v24.16b,v24.16b
795
	aesd	v25.16b,v18.16b
796
	aesimc	v25.16b,v25.16b
797
	aesd	v26.16b,v18.16b
798
	aesimc	v26.16b,v26.16b
799
	add	x0,x0,x6		// x0 is adjusted in such way that
800
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
801
	add	x6,x2,#0x60		// because .Lecb_tail4x
802
803
	aesd	v0.16b,v19.16b
804
	aesimc	v0.16b,v0.16b
805
	aesd	v1.16b,v19.16b
806
	aesimc	v1.16b,v1.16b
807
	aesd	v24.16b,v19.16b
808
	aesimc	v24.16b,v24.16b
809
	aesd	v25.16b,v19.16b
810
	aesimc	v25.16b,v25.16b
811
	aesd	v26.16b,v19.16b
812
	aesimc	v26.16b,v26.16b
813
814
	aesd	v0.16b,v20.16b
815
	aesimc	v0.16b,v0.16b
816
	aesd	v1.16b,v20.16b
817
	aesimc	v1.16b,v1.16b
818
	aesd	v24.16b,v20.16b
819
	aesimc	v24.16b,v24.16b
820
	aesd	v25.16b,v20.16b
821
	aesimc	v25.16b,v25.16b
822
	aesd	v26.16b,v20.16b
823
	aesimc	v26.16b,v26.16b
824
825
	aesd	v0.16b,v21.16b
826
	aesimc	v0.16b,v0.16b
827
	aesd	v1.16b,v21.16b
828
	aesimc	v1.16b,v1.16b
829
	aesd	v24.16b,v21.16b
830
	aesimc	v24.16b,v24.16b
831
	aesd	v25.16b,v21.16b
832
	aesimc	v25.16b,v25.16b
833
	aesd	v26.16b,v21.16b
834
	aesimc	v26.16b,v26.16b
835
836
	aesd	v0.16b,v22.16b
837
	aesimc	v0.16b,v0.16b
838
	aesd	v1.16b,v22.16b
839
	aesimc	v1.16b,v1.16b
840
	aesd	v24.16b,v22.16b
841
	aesimc	v24.16b,v24.16b
842
	aesd	v25.16b,v22.16b
843
	aesimc	v25.16b,v25.16b
844
	aesd	v26.16b,v22.16b
845
	aesimc	v26.16b,v26.16b
846
847
	aesd	v0.16b,v23.16b
848
	ld1	{v2.16b},[x0],#16
849
	aesd	v1.16b,v23.16b
850
	ld1	{v3.16b},[x0],#16
851
	aesd	v24.16b,v23.16b
852
	ld1	{v27.16b},[x0],#16
853
	aesd	v25.16b,v23.16b
854
	ld1	{v28.16b},[x0],#16
855
	aesd	v26.16b,v23.16b
856
	ld1	{v29.16b},[x0],#16
857
	cbz	x6,.Lecb_tail4x
858
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
859
	eor	v4.16b,v7.16b,v0.16b
860
	orr	v0.16b,v2.16b,v2.16b
861
	eor	v5.16b,v7.16b,v1.16b
862
	orr	v1.16b,v3.16b,v3.16b
863
	eor	v17.16b,v7.16b,v24.16b
864
	orr	v24.16b,v27.16b,v27.16b
865
	eor	v30.16b,v7.16b,v25.16b
866
	orr	v25.16b,v28.16b,v28.16b
867
	eor	v31.16b,v7.16b,v26.16b
868
	st1	{v4.16b},[x1],#16
869
	orr	v26.16b,v29.16b,v29.16b
870
	st1	{v5.16b},[x1],#16
871
	mov	w6,w5
872
	st1	{v17.16b},[x1],#16
873
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
874
	st1	{v30.16b},[x1],#16
875
	st1	{v31.16b},[x1],#16
876
	b.hs	.Loop5x_ecb_dec
877
878
	add	x2,x2,#0x50
879
	cbz	x2,.Lecb_done
880
881
	add	w6,w5,#2
882
	subs	x2,x2,#0x30
883
	orr	v0.16b,v27.16b,v27.16b
884
	orr	v1.16b,v28.16b,v28.16b
885
	orr	v24.16b,v29.16b,v29.16b
886
	b.lo	.Lecb_dec_tail
887
888
	b	.Loop3x_ecb_dec
889
890
.align	4
891
.Lecb_tail4x:	// flush last 4 in-flight blocks
892
	eor	v5.16b,v7.16b,v1.16b
893
	eor	v17.16b,v7.16b,v24.16b
894
	eor	v30.16b,v7.16b,v25.16b
895
	eor	v31.16b,v7.16b,v26.16b
896
	st1	{v5.16b},[x1],#16
897
	st1	{v17.16b},[x1],#16
898
	st1	{v30.16b},[x1],#16
899
	st1	{v31.16b},[x1],#16
900
901
	b	.Lecb_done
902
.align	4
903
.Loop3x_ecb_dec:	// 3-way interleaved decrypt rounds
904
	aesd	v0.16b,v16.16b
905
	aesimc	v0.16b,v0.16b
906
	aesd	v1.16b,v16.16b
907
	aesimc	v1.16b,v1.16b
908
	aesd	v24.16b,v16.16b
909
	aesimc	v24.16b,v24.16b
910
	ld1	{v16.4s},[x7],#16
911
	subs	w6,w6,#2
912
	aesd	v0.16b,v17.16b
913
	aesimc	v0.16b,v0.16b
914
	aesd	v1.16b,v17.16b
915
	aesimc	v1.16b,v1.16b
916
	aesd	v24.16b,v17.16b
917
	aesimc	v24.16b,v24.16b
918
	ld1	{v17.4s},[x7],#16
919
	b.gt	.Loop3x_ecb_dec
920
921
	aesd	v0.16b,v16.16b
922
	aesimc	v0.16b,v0.16b
923
	aesd	v1.16b,v16.16b
924
	aesimc	v1.16b,v1.16b
925
	aesd	v24.16b,v16.16b
926
	aesimc	v24.16b,v24.16b
927
	subs	x2,x2,#0x30
928
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
929
	aesd	v0.16b,v17.16b
930
	aesimc	v0.16b,v0.16b
931
	aesd	v1.16b,v17.16b
932
	aesimc	v1.16b,v1.16b
933
	aesd	v24.16b,v17.16b
934
	aesimc	v24.16b,v24.16b
935
	add	x0,x0,x6		// x0 is adjusted in such way that
936
					// at exit from the loop v1.16b-v24.16b
					// are loaded with last "words"
937
	mov	x7,x3
938
	aesd	v0.16b,v20.16b
939
	aesimc	v0.16b,v0.16b
940
	aesd	v1.16b,v20.16b
941
	aesimc	v1.16b,v1.16b
942
	aesd	v24.16b,v20.16b
943
	aesimc	v24.16b,v24.16b
944
	ld1	{v2.16b},[x0],#16
945
	aesd	v0.16b,v21.16b
946
	aesimc	v0.16b,v0.16b
947
	aesd	v1.16b,v21.16b
948
	aesimc	v1.16b,v1.16b
949
	aesd	v24.16b,v21.16b
950
	aesimc	v24.16b,v24.16b
951
	ld1	{v3.16b},[x0],#16
952
	aesd	v0.16b,v22.16b
953
	aesimc	v0.16b,v0.16b
954
	aesd	v1.16b,v22.16b
955
	aesimc	v1.16b,v1.16b
956
	aesd	v24.16b,v22.16b
957
	aesimc	v24.16b,v24.16b
958
	ld1	{v27.16b},[x0],#16
959
	aesd	v0.16b,v23.16b
960
	aesd	v1.16b,v23.16b
961
	aesd	v24.16b,v23.16b
962
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
963
	add	w6,w5,#2
964
	eor	v4.16b,v7.16b,v0.16b
965
	eor	v5.16b,v7.16b,v1.16b
966
	eor	v24.16b,v24.16b,v7.16b
967
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
968
	st1	{v4.16b},[x1],#16
969
	orr	v0.16b,v2.16b,v2.16b
970
	st1	{v5.16b},[x1],#16
971
	orr	v1.16b,v3.16b,v3.16b
972
	st1	{v24.16b},[x1],#16
973
	orr	v24.16b,v27.16b,v27.16b
974
	b.hs	.Loop3x_ecb_dec
975
976
	cmn	x2,#0x30
977
	b.eq	.Lecb_done
978
	nop
979
980
.Lecb_dec_tail:	// last 1-2 blocks (held in v1/v24)
981
	aesd	v1.16b,v16.16b
982
	aesimc	v1.16b,v1.16b
983
	aesd	v24.16b,v16.16b
984
	aesimc	v24.16b,v24.16b
985
	ld1	{v16.4s},[x7],#16
986
	subs	w6,w6,#2
987
	aesd	v1.16b,v17.16b
988
	aesimc	v1.16b,v1.16b
989
	aesd	v24.16b,v17.16b
990
	aesimc	v24.16b,v24.16b
991
	ld1	{v17.4s},[x7],#16
992
	b.gt	.Lecb_dec_tail
993
994
	aesd	v1.16b,v16.16b
995
	aesimc	v1.16b,v1.16b
996
	aesd	v24.16b,v16.16b
997
	aesimc	v24.16b,v24.16b
998
	aesd	v1.16b,v17.16b
999
	aesimc	v1.16b,v1.16b
1000
	aesd	v24.16b,v17.16b
1001
	aesimc	v24.16b,v24.16b
1002
	aesd	v1.16b,v20.16b
1003
	aesimc	v1.16b,v1.16b
1004
	aesd	v24.16b,v20.16b
1005
	aesimc	v24.16b,v24.16b
1006
	cmn	x2,#0x20
1007
	aesd	v1.16b,v21.16b
1008
	aesimc	v1.16b,v1.16b
1009
	aesd	v24.16b,v21.16b
1010
	aesimc	v24.16b,v24.16b
1011
	aesd	v1.16b,v22.16b
1012
	aesimc	v1.16b,v1.16b
1013
	aesd	v24.16b,v22.16b
1014
	aesimc	v24.16b,v24.16b
1015
	aesd	v1.16b,v23.16b
1016
	aesd	v24.16b,v23.16b
1017
	b.eq	.Lecb_dec_one
1018
	eor	v5.16b,v7.16b,v1.16b
1019
	eor	v17.16b,v7.16b,v24.16b
1020
	st1	{v5.16b},[x1],#16
1021
	st1	{v17.16b},[x1],#16
1022
	b	.Lecb_done
1023
1024
.Lecb_dec_one:
1025
	eor	v5.16b,v7.16b,v24.16b
1026
	st1	{v5.16b},[x1],#16
1027
1028
.Lecb_done:
1029
	ldr	x29,[sp],#16
1030
.Lecb_Final_abort:
1031
	ret
1032
.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
1037
.globl aes_v8_cbc_encrypt
1038
.type aes_v8_cbc_encrypt,%function
1039
.align 5
1040
aes_v8_cbc_encrypt:
1041
AARCH64_VALID_CALL_TARGET
1042
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1043
stp x29,x30,[sp,#-16]!
1044
add x29,sp,#0
1045
subs x2,x2,#16
1046
mov x8,#16
1047
b.lo .Lcbc_abort
1048
csel x8,xzr,x8,eq
1049
1050
cmp w5,#0 // en- or decrypting?
1051
ldr w5,[x3,#240]
1052
and x2,x2,#-16
1053
ld1 {v6.16b},[x4]
1054
ld1 {v0.16b},[x0],x8
1055
1056
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
1057
sub w5,w5,#6
1058
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
1059
sub w5,w5,#2
1060
ld1 {v18.4s,v19.4s},[x7],#32
1061
ld1 {v20.4s,v21.4s},[x7],#32
1062
ld1 {v22.4s,v23.4s},[x7],#32
1063
ld1 {v7.4s},[x7]
1064
1065
add x7,x3,#32
1066
mov w6,w5
1067
b.eq .Lcbc_dec
1068
1069
cmp w5,#2
1070
eor v0.16b,v0.16b,v6.16b
1071
eor v5.16b,v16.16b,v7.16b
1072
b.eq .Lcbc_enc128
1073
1074
ld1 {v2.4s,v3.4s},[x7]
1075
add x7,x3,#16
1076
add x6,x3,#16*4
1077
add x12,x3,#16*5
1078
aese v0.16b,v16.16b
1079
aesmc v0.16b,v0.16b
1080
add x14,x3,#16*6
1081
add x3,x3,#16*7
1082
b .Lenter_cbc_enc
1083
1084
.align 4
1085
.Loop_cbc_enc:
1086
aese v0.16b,v16.16b
1087
aesmc v0.16b,v0.16b
1088
st1 {v6.16b},[x1],#16
1089
.Lenter_cbc_enc:
1090
aese v0.16b,v17.16b
1091
aesmc v0.16b,v0.16b
1092
aese v0.16b,v2.16b
1093
aesmc v0.16b,v0.16b
1094
ld1 {v16.4s},[x6]
1095
cmp w5,#4
1096
aese v0.16b,v3.16b
1097
aesmc v0.16b,v0.16b
1098
ld1 {v17.4s},[x12]
1099
b.eq .Lcbc_enc192
1100
1101
aese v0.16b,v16.16b
1102
aesmc v0.16b,v0.16b
1103
ld1 {v16.4s},[x14]
1104
aese v0.16b,v17.16b
1105
aesmc v0.16b,v0.16b
1106
ld1 {v17.4s},[x3]
1107
nop
1108
1109
.Lcbc_enc192:
1110
aese v0.16b,v16.16b
1111
aesmc v0.16b,v0.16b
1112
subs x2,x2,#16
1113
aese v0.16b,v17.16b
1114
aesmc v0.16b,v0.16b
1115
csel x8,xzr,x8,eq
1116
aese v0.16b,v18.16b
1117
aesmc v0.16b,v0.16b
1118
aese v0.16b,v19.16b
1119
aesmc v0.16b,v0.16b
1120
ld1 {v16.16b},[x0],x8
1121
aese v0.16b,v20.16b
1122
aesmc v0.16b,v0.16b
1123
eor v16.16b,v16.16b,v5.16b
1124
aese v0.16b,v21.16b
1125
aesmc v0.16b,v0.16b
1126
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
1127
aese v0.16b,v22.16b
1128
aesmc v0.16b,v0.16b
1129
aese v0.16b,v23.16b
1130
eor v6.16b,v0.16b,v7.16b
1131
b.hs .Loop_cbc_enc
1132
1133
st1 {v6.16b},[x1],#16
1134
b .Lcbc_done
1135
1136
.align 5
1137
.Lcbc_enc128:
1138
ld1 {v2.4s,v3.4s},[x7]
1139
aese v0.16b,v16.16b
1140
aesmc v0.16b,v0.16b
1141
b .Lenter_cbc_enc128
1142
.Loop_cbc_enc128:
1143
aese v0.16b,v16.16b
1144
aesmc v0.16b,v0.16b
1145
st1 {v6.16b},[x1],#16
1146
.Lenter_cbc_enc128:
1147
aese v0.16b,v17.16b
1148
aesmc v0.16b,v0.16b
1149
subs x2,x2,#16
1150
aese v0.16b,v2.16b
1151
aesmc v0.16b,v0.16b
1152
csel x8,xzr,x8,eq
1153
aese v0.16b,v3.16b
1154
aesmc v0.16b,v0.16b
1155
aese v0.16b,v18.16b
1156
aesmc v0.16b,v0.16b
1157
aese v0.16b,v19.16b
1158
aesmc v0.16b,v0.16b
1159
ld1 {v16.16b},[x0],x8
1160
aese v0.16b,v20.16b
1161
aesmc v0.16b,v0.16b
1162
aese v0.16b,v21.16b
1163
aesmc v0.16b,v0.16b
1164
aese v0.16b,v22.16b
1165
aesmc v0.16b,v0.16b
1166
eor v16.16b,v16.16b,v5.16b
1167
aese v0.16b,v23.16b
1168
eor v6.16b,v0.16b,v7.16b
1169
b.hs .Loop_cbc_enc128
1170
1171
st1 {v6.16b},[x1],#16
1172
b .Lcbc_done
1173
.align 5
1174
.Lcbc_dec:
1175
ld1 {v24.16b},[x0],#16
1176
subs x2,x2,#32 // bias
1177
add w6,w5,#2
1178
orr v3.16b,v0.16b,v0.16b
1179
orr v1.16b,v0.16b,v0.16b
1180
orr v27.16b,v24.16b,v24.16b
1181
b.lo .Lcbc_dec_tail
1182
1183
orr v1.16b,v24.16b,v24.16b
1184
ld1 {v24.16b},[x0],#16
1185
orr v2.16b,v0.16b,v0.16b
1186
orr v3.16b,v1.16b,v1.16b
1187
orr v27.16b,v24.16b,v24.16b
1188
cmp x2,#32
1189
b.lo .Loop3x_cbc_dec
1190
1191
ld1 {v25.16b},[x0],#16
1192
ld1 {v26.16b},[x0],#16
1193
sub x2,x2,#32 // bias
1194
mov w6,w5
1195
orr v28.16b,v25.16b,v25.16b
1196
orr v29.16b,v26.16b,v26.16b
1197
1198
.Loop5x_cbc_dec:
1199
aesd v0.16b,v16.16b
1200
aesimc v0.16b,v0.16b
1201
aesd v1.16b,v16.16b
1202
aesimc v1.16b,v1.16b
1203
aesd v24.16b,v16.16b
1204
aesimc v24.16b,v24.16b
1205
aesd v25.16b,v16.16b
1206
aesimc v25.16b,v25.16b
1207
aesd v26.16b,v16.16b
1208
aesimc v26.16b,v26.16b
1209
ld1 {v16.4s},[x7],#16
1210
subs w6,w6,#2
1211
aesd v0.16b,v17.16b
1212
aesimc v0.16b,v0.16b
1213
aesd v1.16b,v17.16b
1214
aesimc v1.16b,v1.16b
1215
aesd v24.16b,v17.16b
1216
aesimc v24.16b,v24.16b
1217
aesd v25.16b,v17.16b
1218
aesimc v25.16b,v25.16b
1219
aesd v26.16b,v17.16b
1220
aesimc v26.16b,v26.16b
1221
ld1 {v17.4s},[x7],#16
1222
b.gt .Loop5x_cbc_dec
1223
1224
aesd v0.16b,v16.16b
1225
aesimc v0.16b,v0.16b
1226
aesd v1.16b,v16.16b
1227
aesimc v1.16b,v1.16b
1228
aesd v24.16b,v16.16b
1229
aesimc v24.16b,v24.16b
1230
aesd v25.16b,v16.16b
1231
aesimc v25.16b,v25.16b
1232
aesd v26.16b,v16.16b
1233
aesimc v26.16b,v26.16b
1234
cmp x2,#0x40 // because .Lcbc_tail4x
1235
sub x2,x2,#0x50
1236
1237
aesd v0.16b,v17.16b
1238
aesimc v0.16b,v0.16b
1239
aesd v1.16b,v17.16b
1240
aesimc v1.16b,v1.16b
1241
aesd v24.16b,v17.16b
1242
aesimc v24.16b,v24.16b
1243
aesd v25.16b,v17.16b
1244
aesimc v25.16b,v25.16b
1245
aesd v26.16b,v17.16b
1246
aesimc v26.16b,v26.16b
1247
csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo
1248
mov x7,x3
1249
1250
aesd v0.16b,v18.16b
1251
aesimc v0.16b,v0.16b
1252
aesd v1.16b,v18.16b
1253
aesimc v1.16b,v1.16b
1254
aesd v24.16b,v18.16b
1255
aesimc v24.16b,v24.16b
1256
aesd v25.16b,v18.16b
1257
aesimc v25.16b,v25.16b
1258
aesd v26.16b,v18.16b
1259
aesimc v26.16b,v26.16b
1260
add x0,x0,x6 // x0 is adjusted in such way that
1261
// at exit from the loop v1.16b-v26.16b
1262
// are loaded with last "words"
1263
add x6,x2,#0x60 // because .Lcbc_tail4x
1264
1265
aesd v0.16b,v19.16b
1266
aesimc v0.16b,v0.16b
1267
aesd v1.16b,v19.16b
1268
aesimc v1.16b,v1.16b
1269
aesd v24.16b,v19.16b
1270
aesimc v24.16b,v24.16b
1271
aesd v25.16b,v19.16b
1272
aesimc v25.16b,v25.16b
1273
aesd v26.16b,v19.16b
1274
aesimc v26.16b,v26.16b
1275
1276
aesd v0.16b,v20.16b
1277
aesimc v0.16b,v0.16b
1278
aesd v1.16b,v20.16b
1279
aesimc v1.16b,v1.16b
1280
aesd v24.16b,v20.16b
1281
aesimc v24.16b,v24.16b
1282
aesd v25.16b,v20.16b
1283
aesimc v25.16b,v25.16b
1284
aesd v26.16b,v20.16b
1285
aesimc v26.16b,v26.16b
1286
1287
aesd v0.16b,v21.16b
1288
aesimc v0.16b,v0.16b
1289
aesd v1.16b,v21.16b
1290
aesimc v1.16b,v1.16b
1291
aesd v24.16b,v21.16b
1292
aesimc v24.16b,v24.16b
1293
aesd v25.16b,v21.16b
1294
aesimc v25.16b,v25.16b
1295
aesd v26.16b,v21.16b
1296
aesimc v26.16b,v26.16b
1297
1298
aesd v0.16b,v22.16b
1299
aesimc v0.16b,v0.16b
1300
aesd v1.16b,v22.16b
1301
aesimc v1.16b,v1.16b
1302
aesd v24.16b,v22.16b
1303
aesimc v24.16b,v24.16b
1304
aesd v25.16b,v22.16b
1305
aesimc v25.16b,v25.16b
1306
aesd v26.16b,v22.16b
1307
aesimc v26.16b,v26.16b
1308
1309
eor v4.16b,v6.16b,v7.16b
1310
aesd v0.16b,v23.16b
1311
eor v5.16b,v2.16b,v7.16b
1312
ld1 {v2.16b},[x0],#16
1313
aesd v1.16b,v23.16b
1314
eor v17.16b,v3.16b,v7.16b
1315
ld1 {v3.16b},[x0],#16
1316
aesd v24.16b,v23.16b
1317
eor v30.16b,v27.16b,v7.16b
1318
ld1 {v27.16b},[x0],#16
1319
aesd v25.16b,v23.16b
1320
eor v31.16b,v28.16b,v7.16b
1321
ld1 {v28.16b},[x0],#16
1322
aesd v26.16b,v23.16b
1323
orr v6.16b,v29.16b,v29.16b
1324
ld1 {v29.16b},[x0],#16
1325
cbz x6,.Lcbc_tail4x
1326
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
1327
eor v4.16b,v4.16b,v0.16b
1328
orr v0.16b,v2.16b,v2.16b
1329
eor v5.16b,v5.16b,v1.16b
1330
orr v1.16b,v3.16b,v3.16b
1331
eor v17.16b,v17.16b,v24.16b
1332
orr v24.16b,v27.16b,v27.16b
1333
eor v30.16b,v30.16b,v25.16b
1334
orr v25.16b,v28.16b,v28.16b
1335
eor v31.16b,v31.16b,v26.16b
1336
st1 {v4.16b},[x1],#16
1337
orr v26.16b,v29.16b,v29.16b
1338
st1 {v5.16b},[x1],#16
1339
mov w6,w5
1340
st1 {v17.16b},[x1],#16
1341
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
1342
st1 {v30.16b},[x1],#16
1343
st1 {v31.16b},[x1],#16
1344
b.hs .Loop5x_cbc_dec
1345
1346
add x2,x2,#0x50
1347
cbz x2,.Lcbc_done
1348
1349
add w6,w5,#2
1350
subs x2,x2,#0x30
1351
orr v0.16b,v27.16b,v27.16b
1352
orr v2.16b,v27.16b,v27.16b
1353
orr v1.16b,v28.16b,v28.16b
1354
orr v3.16b,v28.16b,v28.16b
1355
orr v24.16b,v29.16b,v29.16b
1356
orr v27.16b,v29.16b,v29.16b
1357
b.lo .Lcbc_dec_tail
1358
1359
b .Loop3x_cbc_dec
1360
1361
.align 4
1362
.Lcbc_tail4x:
1363
eor v5.16b,v4.16b,v1.16b
1364
eor v17.16b,v17.16b,v24.16b
1365
eor v30.16b,v30.16b,v25.16b
1366
eor v31.16b,v31.16b,v26.16b
1367
st1 {v5.16b},[x1],#16
1368
st1 {v17.16b},[x1],#16
1369
st1 {v30.16b},[x1],#16
1370
st1 {v31.16b},[x1],#16
1371
1372
b .Lcbc_done
1373
.align 4
1374
.Loop3x_cbc_dec:
1375
aesd v0.16b,v16.16b
1376
aesimc v0.16b,v0.16b
1377
aesd v1.16b,v16.16b
1378
aesimc v1.16b,v1.16b
1379
aesd v24.16b,v16.16b
1380
aesimc v24.16b,v24.16b
1381
ld1 {v16.4s},[x7],#16
1382
subs w6,w6,#2
1383
aesd v0.16b,v17.16b
1384
aesimc v0.16b,v0.16b
1385
aesd v1.16b,v17.16b
1386
aesimc v1.16b,v1.16b
1387
aesd v24.16b,v17.16b
1388
aesimc v24.16b,v24.16b
1389
ld1 {v17.4s},[x7],#16
1390
b.gt .Loop3x_cbc_dec
1391
1392
aesd v0.16b,v16.16b
1393
aesimc v0.16b,v0.16b
1394
aesd v1.16b,v16.16b
1395
aesimc v1.16b,v1.16b
1396
aesd v24.16b,v16.16b
1397
aesimc v24.16b,v24.16b
1398
eor v4.16b,v6.16b,v7.16b
1399
subs x2,x2,#0x30
1400
eor v5.16b,v2.16b,v7.16b
1401
csel x6,x2,x6,lo // x6, w6, is zero at this point
1402
aesd v0.16b,v17.16b
1403
aesimc v0.16b,v0.16b
1404
aesd v1.16b,v17.16b
1405
aesimc v1.16b,v1.16b
1406
aesd v24.16b,v17.16b
1407
aesimc v24.16b,v24.16b
1408
eor v17.16b,v3.16b,v7.16b
1409
add x0,x0,x6 // x0 is adjusted in such way that
1410
// at exit from the loop v1.16b-v24.16b
1411
// are loaded with last "words"
1412
orr v6.16b,v27.16b,v27.16b
1413
mov x7,x3
1414
aesd v0.16b,v20.16b
1415
aesimc v0.16b,v0.16b
1416
aesd v1.16b,v20.16b
1417
aesimc v1.16b,v1.16b
1418
aesd v24.16b,v20.16b
1419
aesimc v24.16b,v24.16b
1420
ld1 {v2.16b},[x0],#16
1421
aesd v0.16b,v21.16b
1422
aesimc v0.16b,v0.16b
1423
aesd v1.16b,v21.16b
1424
aesimc v1.16b,v1.16b
1425
aesd v24.16b,v21.16b
1426
aesimc v24.16b,v24.16b
1427
ld1 {v3.16b},[x0],#16
1428
aesd v0.16b,v22.16b
1429
aesimc v0.16b,v0.16b
1430
aesd v1.16b,v22.16b
1431
aesimc v1.16b,v1.16b
1432
aesd v24.16b,v22.16b
1433
aesimc v24.16b,v24.16b
1434
ld1 {v27.16b},[x0],#16
1435
aesd v0.16b,v23.16b
1436
aesd v1.16b,v23.16b
1437
aesd v24.16b,v23.16b
1438
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
1439
add w6,w5,#2
1440
eor v4.16b,v4.16b,v0.16b
1441
eor v5.16b,v5.16b,v1.16b
1442
eor v24.16b,v24.16b,v17.16b
1443
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
1444
st1 {v4.16b},[x1],#16
1445
orr v0.16b,v2.16b,v2.16b
1446
st1 {v5.16b},[x1],#16
1447
orr v1.16b,v3.16b,v3.16b
1448
st1 {v24.16b},[x1],#16
1449
orr v24.16b,v27.16b,v27.16b
1450
b.hs .Loop3x_cbc_dec
1451
1452
cmn x2,#0x30
1453
b.eq .Lcbc_done
1454
nop
1455
1456
.Lcbc_dec_tail:
1457
aesd v1.16b,v16.16b
1458
aesimc v1.16b,v1.16b
1459
aesd v24.16b,v16.16b
1460
aesimc v24.16b,v24.16b
1461
ld1 {v16.4s},[x7],#16
1462
subs w6,w6,#2
1463
aesd v1.16b,v17.16b
1464
aesimc v1.16b,v1.16b
1465
aesd v24.16b,v17.16b
1466
aesimc v24.16b,v24.16b
1467
ld1 {v17.4s},[x7],#16
1468
b.gt .Lcbc_dec_tail
1469
1470
aesd v1.16b,v16.16b
1471
aesimc v1.16b,v1.16b
1472
aesd v24.16b,v16.16b
1473
aesimc v24.16b,v24.16b
1474
aesd v1.16b,v17.16b
1475
aesimc v1.16b,v1.16b
1476
aesd v24.16b,v17.16b
1477
aesimc v24.16b,v24.16b
1478
aesd v1.16b,v20.16b
1479
aesimc v1.16b,v1.16b
1480
aesd v24.16b,v20.16b
1481
aesimc v24.16b,v24.16b
1482
cmn x2,#0x20
1483
aesd v1.16b,v21.16b
1484
aesimc v1.16b,v1.16b
1485
aesd v24.16b,v21.16b
1486
aesimc v24.16b,v24.16b
1487
eor v5.16b,v6.16b,v7.16b
1488
aesd v1.16b,v22.16b
1489
aesimc v1.16b,v1.16b
1490
aesd v24.16b,v22.16b
1491
aesimc v24.16b,v24.16b
1492
eor v17.16b,v3.16b,v7.16b
1493
aesd v1.16b,v23.16b
1494
aesd v24.16b,v23.16b
1495
b.eq .Lcbc_dec_one
1496
eor v5.16b,v5.16b,v1.16b
1497
eor v17.16b,v17.16b,v24.16b
1498
orr v6.16b,v27.16b,v27.16b
1499
st1 {v5.16b},[x1],#16
1500
st1 {v17.16b},[x1],#16
1501
b .Lcbc_done
1502
1503
.Lcbc_dec_one:
1504
eor v5.16b,v5.16b,v24.16b
1505
orr v6.16b,v27.16b,v27.16b
1506
st1 {v5.16b},[x1],#16
1507
1508
.Lcbc_done:
1509
st1 {v6.16b},[x4]
1510
.Lcbc_abort:
1511
ldr x29,[sp],#16
1512
ret
1513
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
1514
.globl aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.type aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
.align 5
//-----------------------------------------------------------------------
// AES-CTR bulk encryption, 12-way interleaved, using the SHA3 EOR3
// instruction (encoded via .inst, with the textual form in each comment)
// to fold last-round-key ^ keystream ^ input in a single op.
//
// In:  x0 = input blocks, x1 = output blocks, x2 = block count (16B each),
//      x3 = expanded AES key schedule (round count at [x3,#240]),
//      x4 = 16-byte counter block; the low 32-bit word ([x4,#12]) is the
//           big-endian block counter (rev'd into host order on LE builds).
// Out: x1 filled; only the in-register counter is advanced (w8).
//
// Register roles: v1 = last round key, v2/v3 = rotating pair of round
// keys streamed from [x7], w8 = host-order counter, w5/w6 = remaining
// round pairs, v24..v31,v8..v11 = up to 12 in-flight counter blocks.
// v8-v15 are AAPCS64 callee-saved, hence the d8-d15 spills below.
//-----------------------------------------------------------------------
aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
AARCH64_VALID_CALL_TARGET
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
stp x29,x30,[sp,#-80]!
stp d8,d9,[sp, #16]
stp d10,d11,[sp, #32]
stp d12,d13,[sp, #48]
stp d14,d15,[sp, #64]
add x29,sp,#0

ldr w5,[x3,#240]

ldr w8, [x4, #12]
#ifdef __AARCH64EB__
ld1 {v24.16b},[x4]
#else
ld1 {v24.4s},[x4]
#endif
ld1 {v2.4s,v3.4s},[x3] // load key schedule...
sub w5,w5,#4
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last round key
sub w5,w5,#2
add x7, x7, #64
ld1 {v1.4s},[x7] // v1 = last round key, folded in via EOR3
add x7,x3,#32
mov w6,w5
#ifndef __AARCH64EB__
rev w8, w8 // counter is stored big-endian; work on it host-order
#endif

// Seed up to three counter blocks (v24..v26); v0 keeps the IV template
// whose lane 3 is overwritten with each fresh (re-byteswapped) counter.
orr v25.16b,v24.16b,v24.16b
add w10, w8, #1
orr v26.16b,v24.16b,v24.16b
add w8, w8, #2
orr v0.16b,v24.16b,v24.16b
rev w10, w10
mov v25.s[3],w10
b.ls .Lctr32_tail_unroll // <=2 blocks: no interleaved main loop
cmp x2,#6
rev w12, w8
sub x2,x2,#3 // bias
mov v26.s[3],w12
b.lo .Loop3x_ctr32_unroll
cmp x2,#9
// Seed counters 4..6 (v27..v29) for the 6x path.
orr v27.16b,v24.16b,v24.16b
add w11, w8, #1
orr v28.16b,v24.16b,v24.16b
add w13, w8, #2
rev w11, w11
orr v29.16b,v24.16b,v24.16b
add w8, w8, #3
rev w13, w13
mov v27.s[3],w11
rev w14, w8
mov v28.s[3],w13
mov v29.s[3],w14
sub x2,x2,#3
b.lo .Loop6x_ctr32_unroll

// push regs to stack when 12 data chunks are interleaved
stp x19,x20,[sp,#-16]!
stp x21,x22,[sp,#-16]!
stp x23,x24,[sp,#-16]!
stp d8,d9,[sp,#-32]!
stp d10,d11,[sp,#-32]!

// Seed counters 7..12 (v30,v31,v8..v11) for the 12x path.
add w15,w8,#1
add w19,w8,#2
add w20,w8,#3
add w21,w8,#4
add w22,w8,#5
add w8,w8,#6
orr v30.16b,v24.16b,v24.16b
rev w15,w15
orr v31.16b,v24.16b,v24.16b
rev w19,w19
orr v8.16b,v24.16b,v24.16b
rev w20,w20
orr v9.16b,v24.16b,v24.16b
rev w21,w21
orr v10.16b,v24.16b,v24.16b
rev w22,w22
orr v11.16b,v24.16b,v24.16b
rev w23,w8

sub x2,x2,#6 // bias
mov v30.s[3],w15
mov v31.s[3],w19
mov v8.s[3],w20
mov v9.s[3],w21
mov v10.s[3],w22
mov v11.s[3],w23
b .Loop12x_ctr32_unroll

.align 4
// Inner rounds: two rounds per iteration (v2 then v3), 12 blocks in flight.
.Loop12x_ctr32_unroll:
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
aese v30.16b,v2.16b
aesmc v30.16b,v30.16b
aese v31.16b,v2.16b
aesmc v31.16b,v31.16b
aese v8.16b,v2.16b
aesmc v8.16b,v8.16b
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b
aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
ld1 {v2.4s},[x7],#16
subs w6,w6,#2
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
aese v30.16b,v3.16b
aesmc v30.16b,v30.16b
aese v31.16b,v3.16b
aesmc v31.16b,v31.16b
aese v8.16b,v3.16b
aesmc v8.16b,v8.16b
aese v9.16b,v3.16b
aesmc v9.16b,v9.16b
aese v10.16b,v3.16b
aesmc v10.16b,v10.16b
aese v11.16b,v3.16b
aesmc v11.16b,v11.16b
ld1 {v3.4s},[x7],#16
b.gt .Loop12x_ctr32_unroll

// Final four rounds, unrolled; counter prep and data I/O are interleaved
// with the crypto to hide latency.
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
aese v30.16b,v2.16b
aesmc v30.16b,v30.16b
aese v31.16b,v2.16b
aesmc v31.16b,v31.16b
aese v8.16b,v2.16b
aesmc v8.16b,v8.16b
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b
aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
ld1 {v2.4s},[x7],#16

aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
aese v30.16b,v3.16b
aesmc v30.16b,v30.16b
aese v31.16b,v3.16b
aesmc v31.16b,v31.16b
aese v8.16b,v3.16b
aesmc v8.16b,v8.16b
aese v9.16b,v3.16b
aesmc v9.16b,v9.16b
aese v10.16b,v3.16b
aesmc v10.16b,v10.16b
aese v11.16b,v3.16b
aesmc v11.16b,v11.16b
ld1 {v3.4s},[x7],#16

// Compute the next 12 counter values (w9..w15,w19..w23) while rounds run.
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
add w9,w8,#1
add w10,w8,#2
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
add w12,w8,#3
add w11,w8,#4
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
add w13,w8,#5
add w14,w8,#6
rev w9,w9
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
add w15,w8,#7
add w19,w8,#8
rev w10,w10
rev w12,w12
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
add w20,w8,#9
add w21,w8,#10
rev w11,w11
rev w13,w13
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
add w22,w8,#11
add w23,w8,#12
rev w14,w14
rev w15,w15
aese v30.16b,v2.16b
aesmc v30.16b,v30.16b
rev w19,w19
rev w20,w20
aese v31.16b,v2.16b
aesmc v31.16b,v31.16b
rev w21,w21
rev w22,w22
aese v8.16b,v2.16b
aesmc v8.16b,v8.16b
rev w23,w23
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b
aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
ld1 {v2.4s},[x7],#16

// Load the 12 input blocks (v4..v7, v16..v19, v20..v23) mid-round.
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
aese v30.16b,v3.16b
aesmc v30.16b,v30.16b
aese v31.16b,v3.16b
aesmc v31.16b,v31.16b
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
aese v8.16b,v3.16b
aesmc v8.16b,v8.16b
aese v9.16b,v3.16b
aesmc v9.16b,v9.16b
aese v10.16b,v3.16b
aesmc v10.16b,v10.16b
aese v11.16b,v3.16b
aesmc v11.16b,v11.16b
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
ld1 {v3.4s},[x7],#16

mov x7, x3 // rewind key pointer for the next pass
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
aese v30.16b,v2.16b
aesmc v30.16b,v30.16b
aese v31.16b,v2.16b
aesmc v31.16b,v31.16b
aese v8.16b,v2.16b
aesmc v8.16b,v8.16b
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b
aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]

// Last round + EOR3(input, last-round-key, keystream); each counter
// register is immediately recycled from the IV template v0.
aese v24.16b,v3.16b
.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
orr v24.16b,v0.16b,v0.16b
aese v25.16b,v3.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
orr v25.16b,v0.16b,v0.16b
aese v26.16b,v3.16b
.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
orr v26.16b,v0.16b,v0.16b
aese v27.16b,v3.16b
.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
orr v27.16b,v0.16b,v0.16b
aese v28.16b,v3.16b
.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
orr v28.16b,v0.16b,v0.16b
aese v29.16b,v3.16b
.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
orr v29.16b,v0.16b,v0.16b
aese v30.16b,v3.16b
.inst 0xce017a52 //eor3 v18.16b,v18.16b,v1.16b,v30.16b
orr v30.16b,v0.16b,v0.16b
aese v31.16b,v3.16b
.inst 0xce017e73 //eor3 v19.16b,v19.16b,v1.16b,v31.16b
orr v31.16b,v0.16b,v0.16b
aese v8.16b,v3.16b
.inst 0xce012294 //eor3 v20.16b,v20.16b,v1.16b,v8.16b
orr v8.16b,v0.16b,v0.16b
aese v9.16b,v3.16b
.inst 0xce0126b5 //eor3 v21.16b,v21.16b,v1.16b,v9.16b
orr v9.16b,v0.16b,v0.16b
aese v10.16b,v3.16b
.inst 0xce012ad6 //eor3 v22.16b,v22.16b,v1.16b,v10.16b
orr v10.16b,v0.16b,v0.16b
aese v11.16b,v3.16b
.inst 0xce012ef7 //eor3 v23.16b,v23.16b,v1.16b,v11.16b
orr v11.16b,v0.16b,v0.16b
ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]

// Insert the 12 pre-byteswapped counters and store the 12 ciphertexts.
mov v24.s[3],w9
mov v25.s[3],w10
mov v26.s[3],w12
mov v27.s[3],w11
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
mov v28.s[3],w13
mov v29.s[3],w14
mov v30.s[3],w15
mov v31.s[3],w19
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
mov v8.s[3],w20
mov v9.s[3],w21
mov v10.s[3],w22
mov v11.s[3],w23
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

mov w6,w5

add w8,w8,#12
subs x2,x2,#12
b.hs .Loop12x_ctr32_unroll

// pop regs from stack when 12 data chunks are interleaved
ldp d10,d11,[sp],#32
ldp d8,d9,[sp],#32
ldp x23,x24,[sp],#16
ldp x21,x22,[sp],#16
ldp x19,x20,[sp],#16

add x2,x2,#12 // undo bias; x2 = leftover blocks (0..11)
cbz x2,.Lctr32_done_unroll
sub w8,w8,#12 // counters for the leftovers were over-advanced

cmp x2,#2
b.ls .Lctr32_tail_unroll

cmp x2,#6
sub x2,x2,#3 // bias
add w8,w8,#3
b.lo .Loop3x_ctr32_unroll

sub x2,x2,#3
add w8,w8,#3
b.lo .Loop6x_ctr32_unroll

.align 4
// 6-way interleaved variant of the same round structure (v24..v29).
.Loop6x_ctr32_unroll:
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
ld1 {v2.4s},[x7],#16
subs w6,w6,#2
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
ld1 {v3.4s},[x7],#16
b.gt .Loop6x_ctr32_unroll

aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
ld1 {v2.4s},[x7],#16

aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
ld1 {v3.4s},[x7],#16

// Next 6 counter values computed during the closing rounds.
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
add w9,w8,#1
add w10,w8,#2
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
add w12,w8,#3
add w11,w8,#4
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
add w13,w8,#5
add w14,w8,#6
rev w9,w9
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
rev w10,w10
rev w12,w12
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
rev w11,w11
rev w13,w13
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
rev w14,w14
ld1 {v2.4s},[x7],#16

aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
aese v27.16b,v3.16b
aesmc v27.16b,v27.16b
ld1 {v16.16b,v17.16b},[x0],#32
aese v28.16b,v3.16b
aesmc v28.16b,v28.16b
aese v29.16b,v3.16b
aesmc v29.16b,v29.16b
ld1 {v3.4s},[x7],#16

mov x7, x3 // rewind key pointer
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
aese v27.16b,v2.16b
aesmc v27.16b,v27.16b
aese v28.16b,v2.16b
aesmc v28.16b,v28.16b
aese v29.16b,v2.16b
aesmc v29.16b,v29.16b
ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]

aese v24.16b,v3.16b
.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
aese v25.16b,v3.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
aese v26.16b,v3.16b
.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
aese v27.16b,v3.16b
.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
aese v28.16b,v3.16b
.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
aese v29.16b,v3.16b
.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]

// Recycle the six counter registers from the IV template.
orr v24.16b,v0.16b,v0.16b
orr v25.16b,v0.16b,v0.16b
orr v26.16b,v0.16b,v0.16b
orr v27.16b,v0.16b,v0.16b
orr v28.16b,v0.16b,v0.16b
orr v29.16b,v0.16b,v0.16b

mov v24.s[3],w9
mov v25.s[3],w10
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
mov v26.s[3],w12
mov v27.s[3],w11
st1 {v16.16b,v17.16b},[x1],#32
mov v28.s[3],w13
mov v29.s[3],w14

cbz x2,.Lctr32_done_unroll
mov w6,w5

cmp x2,#2
b.ls .Lctr32_tail_unroll

sub x2,x2,#3 // bias
add w8,w8,#3
b .Loop3x_ctr32_unroll

.align 4
// 3-way interleaved variant (v24..v26; v9..v11 take over mid-pipeline
// so the counter registers can be refreshed early).
.Loop3x_ctr32_unroll:
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
ld1 {v2.4s},[x7],#16
subs w6,w6,#2
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
aese v26.16b,v3.16b
aesmc v26.16b,v26.16b
ld1 {v3.4s},[x7],#16
b.gt .Loop3x_ctr32_unroll

aese v24.16b,v2.16b
aesmc v9.16b,v24.16b
aese v25.16b,v2.16b
aesmc v10.16b,v25.16b
ld1 {v4.16b,v5.16b,v6.16b},[x0],#48
orr v24.16b,v0.16b,v0.16b
aese v26.16b,v2.16b
aesmc v26.16b,v26.16b
ld1 {v2.4s},[x7],#16
orr v25.16b,v0.16b,v0.16b
aese v9.16b,v3.16b
aesmc v9.16b,v9.16b
aese v10.16b,v3.16b
aesmc v10.16b,v10.16b
aese v26.16b,v3.16b
aesmc v11.16b,v26.16b
ld1 {v3.4s},[x7],#16
orr v26.16b,v0.16b,v0.16b
add w9,w8,#1
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b
aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
add w10,w8,#2
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
ld1 {v2.4s},[x7],#16
add w8,w8,#3
aese v9.16b,v3.16b
aesmc v9.16b,v9.16b
aese v10.16b,v3.16b
aesmc v10.16b,v10.16b

rev w9,w9
aese v11.16b,v3.16b
aesmc v11.16b,v11.16b
ld1 {v3.4s},[x7],#16
mov v24.s[3], w9
mov x7,x3 // rewind key pointer
rev w10,w10
aese v9.16b,v2.16b
aesmc v9.16b,v9.16b

aese v10.16b,v2.16b
aesmc v10.16b,v10.16b
mov v25.s[3], w10
rev w12,w8
aese v11.16b,v2.16b
aesmc v11.16b,v11.16b
mov v26.s[3], w12

aese v9.16b,v3.16b
aese v10.16b,v3.16b
aese v11.16b,v3.16b

.inst 0xce012484 //eor3 v4.16b,v4.16b,v1.16b,v9.16b
ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]
.inst 0xce0128a5 //eor3 v5.16b,v5.16b,v1.16b,v10.16b
mov w6,w5
.inst 0xce012cc6 //eor3 v6.16b,v6.16b,v1.16b,v11.16b
ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b,v5.16b,v6.16b},[x1],#48

cbz x2,.Lctr32_done_unroll

// 1- or 2-block tail: straight-line full-round encryption of the
// remaining counter block(s), no interleave tricks.
.Lctr32_tail_unroll:
cmp x2,#1
b.eq .Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
ld1 {v2.4s},[x7],#16
subs w6,w6,#2
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
ld1 {v3.4s},[x7],#16
b.gt .Lctr32_tail_2_unroll

aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
ld1 {v2.4s},[x7],#16
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
ld1 {v3.4s},[x7],#16
ld1 {v4.16b,v5.16b},[x0],#32
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
ld1 {v2.4s},[x7],#16
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
aese v25.16b,v3.16b
aesmc v25.16b,v25.16b
ld1 {v3.4s},[x7],#16
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v25.16b,v2.16b
aesmc v25.16b,v25.16b
aese v24.16b,v3.16b
aese v25.16b,v3.16b

.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
st1 {v4.16b,v5.16b},[x1],#32
b .Lctr32_done_unroll

.Lctr32_tail_1_unroll:
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
ld1 {v2.4s},[x7],#16
subs w6,w6,#2
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
ld1 {v3.4s},[x7],#16
b.gt .Lctr32_tail_1_unroll

aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
ld1 {v2.4s},[x7],#16
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
ld1 {v3.4s},[x7],#16
ld1 {v4.16b},[x0]
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
ld1 {v2.4s},[x7],#16
aese v24.16b,v3.16b
aesmc v24.16b,v24.16b
ld1 {v3.4s},[x7],#16
aese v24.16b,v2.16b
aesmc v24.16b,v24.16b
aese v24.16b,v3.16b

.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
st1 {v4.16b},[x1],#16

.Lctr32_done_unroll:
// Restore callee-saved FP/SIMD pairs and the frame pushed at entry.
ldp d8,d9,[sp, #16]
ldp d10,d11,[sp, #32]
ldp d12,d13,[sp, #48]
ldp d14,d15,[sp, #64]
ldr x29,[sp],#80
ret
.size aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
2237
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
//-----------------------------------------------------------------------
// AES-CTR bulk encryption, 5x/3x interleaved (no EOR3 needed).
//
// In:  x0 = input blocks, x1 = output blocks, x2 = block count (16B each),
//      x3 = expanded AES key schedule (round count at [x3,#240]),
//      x4 = 16-byte counter block; [x4,#12] is the big-endian 32-bit
//           counter (rev'd into host order on little-endian builds).
//
// Register roles: v16/v17 = rotating round-key pair streamed from [x7],
// v20..v23 = last four middle round keys, v7 = final round key,
// v6 = IV template (lane 3 replaced per block), w8 = host-order counter,
// x12 = tail-load stride (16, or 0 when only one block remains so the
// single block is read once and the second store is suppressed).
//-----------------------------------------------------------------------
aes_v8_ctr32_encrypt_blocks:
AARCH64_VALID_CALL_TARGET
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]

ldr w8, [x4, #12]
#ifdef __AARCH64EB__
ld1 {v0.16b},[x4]
#else
ld1 {v0.4s},[x4]
#endif
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7] // v7 = final round key
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo // single block: tail must not advance past input
#ifndef __AARCH64EB__
rev w8, w8 // counter is stored big-endian; work on it host-order
#endif
// Seed counter blocks v1, v18 (blocks +1, +2); v6 keeps the IV template.
orr v1.16b,v0.16b,v0.16b
add w10, w8, #1
orr v18.16b,v0.16b,v0.16b
add w8, w8, #2
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v1.s[3],w10
b.ls .Lctr32_tail // <=2 blocks: straight to the tail
rev w12, w8
sub x2,x2,#3 // bias
mov v18.s[3],w12
cmp x2,#32
b.lo .Loop3x_ctr32 // fewer than ~35 blocks: 3x path only

// Enough data for the 5x path: seed two more counters (v24, v25).
add w13,w8,#1
add w14,w8,#2
orr v24.16b,v0.16b,v0.16b
rev w13,w13
orr v25.16b,v0.16b,v0.16b
rev w14,w14
mov v24.s[3],w13
sub x2,x2,#2 // bias
mov v25.s[3],w14
add w8,w8,#2
b .Loop5x_ctr32

.align 4
// Inner rounds: two rounds per iteration (v16 then v17), 5 blocks in flight.
.Loop5x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
aese v24.16b,v16.16b
aesmc v24.16b,v24.16b
aese v25.16b,v16.16b
aesmc v25.16b,v25.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
aese v24.16b,v17.16b
aesmc v24.16b,v24.16b
aese v25.16b,v17.16b
aesmc v25.16b,v25.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop5x_ctr32

mov x7,x3 // rewind key pointer for the next pass
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
aese v24.16b,v16.16b
aesmc v24.16b,v24.16b
aese v25.16b,v16.16b
aesmc v25.16b,v25.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]

aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
aese v24.16b,v17.16b
aesmc v24.16b,v24.16b
aese v25.16b,v17.16b
aesmc v25.16b,v25.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]

// Closing rounds (v20..v23) with next-counter computation interleaved.
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
add w9,w8,#1
add w10,w8,#2
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
add w12,w8,#3
add w13,w8,#4
aese v18.16b,v20.16b
aesmc v18.16b,v18.16b
add w14,w8,#5
rev w9,w9
aese v24.16b,v20.16b
aesmc v24.16b,v24.16b
rev w10,w10
rev w12,w12
aese v25.16b,v20.16b
aesmc v25.16b,v25.16b
rev w13,w13
rev w14,w14

aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
aese v18.16b,v21.16b
aesmc v18.16b,v18.16b
aese v24.16b,v21.16b
aesmc v24.16b,v24.16b
aese v25.16b,v21.16b
aesmc v25.16b,v25.16b

// Load the 5 plaintext blocks while the last-but-one round runs.
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
ld1 {v2.16b},[x0],#16
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0],#16
aese v18.16b,v22.16b
aesmc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aese v24.16b,v22.16b
aesmc v24.16b,v24.16b
ld1 {v26.16b},[x0],#16
aese v25.16b,v22.16b
aesmc v25.16b,v25.16b
ld1 {v27.16b},[x0],#16

// Final round; fold the final round key (v7) into the inputs so a
// single eor with the round output finishes the CTR xor.
aese v0.16b,v23.16b
eor v2.16b,v2.16b,v7.16b
aese v1.16b,v23.16b
eor v3.16b,v3.16b,v7.16b
aese v18.16b,v23.16b
eor v19.16b,v19.16b,v7.16b
aese v24.16b,v23.16b
eor v26.16b,v26.16b,v7.16b
aese v25.16b,v23.16b
eor v27.16b,v27.16b,v7.16b

eor v2.16b,v2.16b,v0.16b
orr v0.16b,v6.16b,v6.16b
eor v3.16b,v3.16b,v1.16b
orr v1.16b,v6.16b,v6.16b
eor v19.16b,v19.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
eor v26.16b,v26.16b,v24.16b
orr v24.16b,v6.16b,v6.16b
eor v27.16b,v27.16b,v25.16b
orr v25.16b,v6.16b,v6.16b

// Store ciphertexts; insert the five fresh counters as we go.
st1 {v2.16b},[x1],#16
mov v0.s[3],w9
st1 {v3.16b},[x1],#16
mov v1.s[3],w10
st1 {v19.16b},[x1],#16
mov v18.s[3],w12
st1 {v26.16b},[x1],#16
mov v24.s[3],w13
st1 {v27.16b},[x1],#16
mov v25.s[3],w14

mov w6,w5
cbz x2,.Lctr32_done

add w8,w8,#5
subs x2,x2,#5
b.hs .Loop5x_ctr32

// Fewer than 5 blocks left: fall back to the 3x loop or the tail.
add x2,x2,#5
sub w8,w8,#5

cmp x2,#2
mov x12,#16
csel x12,xzr,x12,lo
b.ls .Lctr32_tail

sub x2,x2,#3 // bias
add w8,w8,#3
b .Loop3x_ctr32

.align 4
// 3-way interleaved rounds (v0, v1, v18).
.Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32

// Closing rounds; results migrate into v4/v5/v17 so v0/v1/v18 can be
// refreshed with the next counters early.
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3 // rewind key pointer
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b // fold final round key into plaintext
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b

eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32

adds x2,x2,#3 // undo bias; x2 = leftover blocks (1..2)
b.eq .Lctr32_done
cmp x2,#1
mov x12,#16
csel x12,xzr,x12,eq // one block left: re-read it instead of advancing

// Tail: encrypt v0 and v1 straight through; the second result is only
// stored when two blocks remain.
.Lctr32_tail:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail

aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12 // x12 = 0 or 16 (see csel above)
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b

cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq .Lctr32_done // single block: skip the second store
st1 {v3.16b},[x1]

.Lctr32_done:
ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
2588
//------------------------------------------------------------------------------
// aes_v8_xts_encrypt — AES-XTS encryption using ARMv8 Crypto Extensions.
//
// Register arguments (AAPCS64; C prototype generated by perlasm — confirm
// against the OpenSSL header):
//   x0  = input ciphertext/plaintext pointer (read with ld1)
//   x1  = output pointer (written with st1)
//   x2  = length in bytes (16 => single-block fast path; tail = len % 16
//         is handled with ciphertext stealing)
//   x3  = key1, block-encryption key schedule ([x3,#240] holds round count)
//   x4  = key2, tweak-encryption key schedule ([x4,#240] holds round count)
//   x5  = 16-byte initial tweak (IV), loaded into v6
//
// Tweaks live in v6,v8,v9,v10,v11 (five blocks per main-loop iteration);
// each next tweak is derived in x9/x10 by the GF(2^128) doubling with the
// 0x87 feedback constant (w19).  Callee-saved x19-x22 and d8-d11 are spilled
// to a 64-byte frame in the big-size path and restored at .Lxts_abort.
//
// NOTE(review): this file is auto-generated from aesv8-armx.pl; the only
// change made here is removing interleaved pasted-in viewer line numbers
// (bare decimals such as "2592"), which are not valid GAS statements and
// prevented the chunk from assembling.  All instructions are unchanged.
//------------------------------------------------------------------------------
.globl aes_v8_xts_encrypt
.type aes_v8_xts_encrypt,%function
.align 5
aes_v8_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_enc_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_enc_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b

	ld1	{v0.16b},[x0]
	eor	v0.16b,v6.16b,v0.16b

	ldr	w6,[x3,#240]
	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...

	aese	v0.16b,v28.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v29.16b
	aesmc	v0.16b,v0.16b
	subs	w6,w6,#10			// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16		// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2			// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	eor	v0.16b,v0.16b,v6.16b
	st1	{v0.16b},[x1]
	b	.Lxts_enc_final_abort

.align 4
.Lxts_enc_big_size:
	stp	x19,x20,[sp,#-64]!
	stp	x21,x22,[sp,#48]
	stp	d8,d9,[sp,#32]
	stp	d10,d11,[sp,#16]

	// tailcnt store the tail value of length%16.
	and	x21,x2,#0xf
	and	x2,x2,#-16
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lxts_abort
	csel	x8,xzr,x8,eq

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b

	// The iv for second block
	// x9- iv(low), x10 - iv(high)
	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10

	ldr	w5,[x3,#240]		// next starting point
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5

	// Encryption
.Lxts_enc:
	ld1	{v24.16b},[x0],#16
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	orr	v27.16b,v24.16b,v24.16b
	orr	v29.16b,v24.16b,v24.16b
	b.lo	.Lxts_inner_enc_tail
	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
	eor	v24.16b,v24.16b,v8.16b

	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10


	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b		// the third block
	eor	v24.16b,v24.16b,v9.16b
	cmp	x2,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10

	ld1	{v25.16b},[x0],#16
	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v26.16b},[x0],#16
	eor	v25.16b,v25.16b,v10.16b		// the fourth block
	eor	v26.16b,v26.16b,v11.16b
	sub	x2,x2,#32			// bias
	mov	w6,w5
	b	.Loop5x_xts_enc

.align 4
.Loop5x_xts_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_xts_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6			// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60			// because .Lxts_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	eor	v4.16b,v7.16b,v6.16b
	aese	v0.16b,v23.16b
	// The iv for first block of one iteration
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v7.16b,v8.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	// The iv for second block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10
	eor	v17.16b,v7.16b,v9.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10
	eor	v30.16b,v7.16b,v10.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10
	eor	v31.16b,v7.16b,v11.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b

	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lxts_enc_tail4x
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v0.16b,v2.16b,v6.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v1.16b,v3.16b,v8.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	eor	v26.16b,v29.16b,v11.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_xts_enc


	// If left 4 blocks, borrow the five block's processing.
	cmn	x2,#0x10
	b.ne	.Loop5x_enc_after
	orr	v11.16b,v10.16b,v10.16b
	orr	v10.16b,v9.16b,v9.16b
	orr	v9.16b,v8.16b,v8.16b
	orr	v8.16b,v6.16b,v6.16b
	fmov	x9,d11
	fmov	x10,v11.d[1]
	eor	v0.16b,v6.16b,v2.16b
	eor	v1.16b,v8.16b,v3.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v26.16b,v29.16b,v11.16b
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	x2,x2,#0x50
	cbz	x2,.Lxts_enc_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	b.lo	.Lxts_inner_enc_tail

	eor	v0.16b,v6.16b,v27.16b
	eor	v1.16b,v8.16b,v28.16b
	eor	v24.16b,v29.16b,v9.16b
	b	.Lxts_outer_enc_tail

.align 4
.Lxts_enc_tail4x:
	add	x0,x0,#16
	eor	v5.16b,v1.16b,v5.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v24.16b,v17.16b
	st1	{v17.16b},[x1],#16
	eor	v30.16b,v25.16b,v30.16b
	eor	v31.16b,v26.16b,v31.16b
	st1	{v30.16b,v31.16b},[x1],#32

	b	.Lxts_enc_done
.align 4
.Lxts_outer_enc_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_outer_enc_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	// The iv for first block
	fmov	x9,d9
	fmov	x10,v9.d[1]
	//mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v8.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v9.16b,v7.16b

	add	x6,x6,#0x20
	add	x0,x0,x6
	mov	x7,x3

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	add	w6,w5,#2
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	st1	{v5.16b},[x1],#16
	st1	{v24.16b},[x1],#16
	cmn	x2,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	orr	v28.16b,v3.16b,v3.16b
	orr	v29.16b,v27.16b,v27.16b
	nop

.Lxts_inner_enc_tail:
	cmn	x2,#0x10
	eor	v1.16b,v28.16b,v6.16b
	eor	v24.16b,v29.16b,v8.16b
	b.eq	.Lxts_enc_tail_loop
	eor	v24.16b,v29.16b,v6.16b
.Lxts_enc_tail_loop:
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_enc_tail_loop

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v8.16b,v7.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	.Lxts_enc_one
	eor	v5.16b,v5.16b,v1.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v8.16b,v8.16b
	st1	{v17.16b},[x1],#16
	fmov	x9,d8
	fmov	x10,v8.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done

.Lxts_enc_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v6.16b,v6.16b
	st1	{v5.16b},[x1],#16
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done
.align 5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	x21,#0xf
	b.eq	.Lxts_abort

	mov	x20,x0
	mov	x13,x1
	sub	x1,x1,#16
.composite_enc_loop:
	subs	x21,x21,#1
	ldrb	w15,[x1,x21]
	ldrb	w14,[x20,x21]
	strb	w15,[x13,x21]
	strb	w14,[x1,x21]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	ld1	{v26.16b},[x1]
	eor	v26.16b,v26.16b,v6.16b

	// Encrypt the composite block to get the last second encrypted text block
	ldr	w6,[x3,#240]		// load key schedule...
	ld1	{v0.4s},[x3],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x3],#16		// load key schedule...
.Loop_final_enc:
	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3],#16
	subs	w6,w6,#2
	aese	v26.16b,v1.16b
	aesmc	v26.16b,v26.16b
	ld1	{v1.4s},[x3],#16
	b.gt	.Loop_final_enc

	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3]
	aese	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v6.16b
	st1	{v26.16b},[x1]

.Lxts_abort:
	ldp	x21,x22,[sp,#48]
	ldp	d8,d9,[sp,#32]
	ldp	d10,d11,[sp,#16]
	ldp	x19,x20,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
3230
.globl aes_v8_xts_decrypt
3231
.type aes_v8_xts_decrypt,%function
3232
.align 5
3233
aes_v8_xts_decrypt:
3234
AARCH64_VALID_CALL_TARGET
3235
cmp x2,#16
3236
// Original input data size bigger than 16, jump to big size processing.
3237
b.ne .Lxts_dec_big_size
3238
// Encrypt the iv with key2, as the first XEX iv.
3239
ldr w6,[x4,#240]
3240
ld1 {v0.4s},[x4],#16
3241
ld1 {v6.16b},[x5]
3242
sub w6,w6,#2
3243
ld1 {v1.4s},[x4],#16
3244
3245
.Loop_dec_small_iv_enc:
3246
aese v6.16b,v0.16b
3247
aesmc v6.16b,v6.16b
3248
ld1 {v0.4s},[x4],#16
3249
subs w6,w6,#2
3250
aese v6.16b,v1.16b
3251
aesmc v6.16b,v6.16b
3252
ld1 {v1.4s},[x4],#16
3253
b.gt .Loop_dec_small_iv_enc
3254
3255
aese v6.16b,v0.16b
3256
aesmc v6.16b,v6.16b
3257
ld1 {v0.4s},[x4]
3258
aese v6.16b,v1.16b
3259
eor v6.16b,v6.16b,v0.16b
3260
3261
ld1 {v0.16b},[x0]
3262
eor v0.16b,v6.16b,v0.16b
3263
3264
ldr w6,[x3,#240]
3265
ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule...
3266
3267
aesd v0.16b,v28.16b
3268
aesimc v0.16b,v0.16b
3269
ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule...
3270
aesd v0.16b,v29.16b
3271
aesimc v0.16b,v0.16b
3272
subs w6,w6,#10 // bias
3273
b.eq .Lxts_128_dec
3274
.Lxts_dec_round_loop:
3275
aesd v0.16b,v16.16b
3276
aesimc v0.16b,v0.16b
3277
ld1 {v16.4s},[x3],#16 // load key schedule...
3278
aesd v0.16b,v17.16b
3279
aesimc v0.16b,v0.16b
3280
ld1 {v17.4s},[x3],#16 // load key schedule...
3281
subs w6,w6,#2 // bias
3282
b.gt .Lxts_dec_round_loop
3283
.Lxts_128_dec:
3284
ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule...
3285
aesd v0.16b,v16.16b
3286
aesimc v0.16b,v0.16b
3287
aesd v0.16b,v17.16b
3288
aesimc v0.16b,v0.16b
3289
ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule...
3290
aesd v0.16b,v18.16b
3291
aesimc v0.16b,v0.16b
3292
aesd v0.16b,v19.16b
3293
aesimc v0.16b,v0.16b
3294
ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule...
3295
aesd v0.16b,v20.16b
3296
aesimc v0.16b,v0.16b
3297
aesd v0.16b,v21.16b
3298
aesimc v0.16b,v0.16b
3299
ld1 {v7.4s},[x3]
3300
aesd v0.16b,v22.16b
3301
aesimc v0.16b,v0.16b
3302
aesd v0.16b,v23.16b
3303
eor v0.16b,v0.16b,v7.16b
3304
eor v0.16b,v6.16b,v0.16b
3305
st1 {v0.16b},[x1]
3306
b .Lxts_dec_final_abort
3307
.Lxts_dec_big_size:
3308
stp x19,x20,[sp,#-64]!
3309
stp x21,x22,[sp,#48]
3310
stp d8,d9,[sp,#32]
3311
stp d10,d11,[sp,#16]
3312
3313
and x21,x2,#0xf
3314
and x2,x2,#-16
3315
subs x2,x2,#16
3316
mov x8,#16
3317
b.lo .Lxts_dec_abort
3318
3319
// Encrypt the iv with key2, as the first XEX iv
3320
ldr w6,[x4,#240]
3321
ld1 {v0.4s},[x4],#16
3322
ld1 {v6.16b},[x5]
3323
sub w6,w6,#2
3324
ld1 {v1.4s},[x4],#16
3325
3326
.Loop_dec_iv_enc:
3327
aese v6.16b,v0.16b
3328
aesmc v6.16b,v6.16b
3329
ld1 {v0.4s},[x4],#16
3330
subs w6,w6,#2
3331
aese v6.16b,v1.16b
3332
aesmc v6.16b,v6.16b
3333
ld1 {v1.4s},[x4],#16
3334
b.gt .Loop_dec_iv_enc
3335
3336
aese v6.16b,v0.16b
3337
aesmc v6.16b,v6.16b
3338
ld1 {v0.4s},[x4]
3339
aese v6.16b,v1.16b
3340
eor v6.16b,v6.16b,v0.16b
3341
3342
// The iv for second block
3343
// x9- iv(low), x10 - iv(high)
3344
// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
3345
fmov x9,d6
3346
fmov x10,v6.d[1]
3347
mov w19,#0x87
3348
extr x22,x10,x10,#32
3349
extr x10,x10,x9,#63
3350
and w11,w19,w22,asr #31
3351
eor x9,x11,x9,lsl #1
3352
fmov d8,x9
3353
fmov v8.d[1],x10
3354
3355
ldr w5,[x3,#240] // load rounds number
3356
3357
// The iv for third block
3358
extr x22,x10,x10,#32
3359
extr x10,x10,x9,#63
3360
and w11,w19,w22,asr #31
3361
eor x9,x11,x9,lsl #1
3362
fmov d9,x9
3363
fmov v9.d[1],x10
3364
3365
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
3366
sub w5,w5,#6
3367
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
3368
sub w5,w5,#2
3369
ld1 {v18.4s,v19.4s},[x7],#32 // load key schedule...
3370
ld1 {v20.4s,v21.4s},[x7],#32
3371
ld1 {v22.4s,v23.4s},[x7],#32
3372
ld1 {v7.4s},[x7]
3373
3374
// The iv for fourth block
3375
extr x22,x10,x10,#32
3376
extr x10,x10,x9,#63
3377
and w11,w19,w22,asr #31
3378
eor x9,x11,x9,lsl #1
3379
fmov d10,x9
3380
fmov v10.d[1],x10
3381
3382
add x7,x3,#32
3383
mov w6,w5
3384
b .Lxts_dec
3385
3386
// Decryption
3387
.align 5
3388
.Lxts_dec:
3389
tst x21,#0xf
3390
b.eq .Lxts_dec_begin
3391
subs x2,x2,#16
3392
csel x8,xzr,x8,eq
3393
ld1 {v0.16b},[x0],#16
3394
b.lo .Lxts_done
3395
sub x0,x0,#16
3396
.Lxts_dec_begin:
3397
ld1 {v0.16b},[x0],x8
3398
subs x2,x2,#32 // bias
3399
add w6,w5,#2
3400
orr v3.16b,v0.16b,v0.16b
3401
orr v1.16b,v0.16b,v0.16b
3402
orr v28.16b,v0.16b,v0.16b
3403
ld1 {v24.16b},[x0],#16
3404
orr v27.16b,v24.16b,v24.16b
3405
orr v29.16b,v24.16b,v24.16b
3406
b.lo .Lxts_inner_dec_tail
3407
eor v0.16b,v0.16b,v6.16b // before decryt, xor with iv
3408
eor v24.16b,v24.16b,v8.16b
3409
3410
orr v1.16b,v24.16b,v24.16b
3411
ld1 {v24.16b},[x0],#16
3412
orr v2.16b,v0.16b,v0.16b
3413
orr v3.16b,v1.16b,v1.16b
3414
eor v27.16b,v24.16b,v9.16b // third block xox with third iv
3415
eor v24.16b,v24.16b,v9.16b
3416
cmp x2,#32
3417
b.lo .Lxts_outer_dec_tail
3418
3419
ld1 {v25.16b},[x0],#16
3420
3421
// The iv for fifth block
3422
extr x22,x10,x10,#32
3423
extr x10,x10,x9,#63
3424
and w11,w19,w22,asr #31
3425
eor x9,x11,x9,lsl #1
3426
fmov d11,x9
3427
fmov v11.d[1],x10
3428
3429
ld1 {v26.16b},[x0],#16
3430
eor v25.16b,v25.16b,v10.16b // the fourth block
3431
eor v26.16b,v26.16b,v11.16b
3432
sub x2,x2,#32 // bias
3433
mov w6,w5
3434
b .Loop5x_xts_dec
3435
3436
.align 4
3437
.Loop5x_xts_dec:
3438
aesd v0.16b,v16.16b
3439
aesimc v0.16b,v0.16b
3440
aesd v1.16b,v16.16b
3441
aesimc v1.16b,v1.16b
3442
aesd v24.16b,v16.16b
3443
aesimc v24.16b,v24.16b
3444
aesd v25.16b,v16.16b
3445
aesimc v25.16b,v25.16b
3446
aesd v26.16b,v16.16b
3447
aesimc v26.16b,v26.16b
3448
ld1 {v16.4s},[x7],#16 // load key schedule...
3449
subs w6,w6,#2
3450
aesd v0.16b,v17.16b
3451
aesimc v0.16b,v0.16b
3452
aesd v1.16b,v17.16b
3453
aesimc v1.16b,v1.16b
3454
aesd v24.16b,v17.16b
3455
aesimc v24.16b,v24.16b
3456
aesd v25.16b,v17.16b
3457
aesimc v25.16b,v25.16b
3458
aesd v26.16b,v17.16b
3459
aesimc v26.16b,v26.16b
3460
ld1 {v17.4s},[x7],#16 // load key schedule...
3461
b.gt .Loop5x_xts_dec
3462
3463
aesd v0.16b,v16.16b
3464
aesimc v0.16b,v0.16b
3465
aesd v1.16b,v16.16b
3466
aesimc v1.16b,v1.16b
3467
aesd v24.16b,v16.16b
3468
aesimc v24.16b,v24.16b
3469
aesd v25.16b,v16.16b
3470
aesimc v25.16b,v25.16b
3471
aesd v26.16b,v16.16b
3472
aesimc v26.16b,v26.16b
3473
subs x2,x2,#0x50 // because .Lxts_dec_tail4x
3474
3475
aesd v0.16b,v17.16b
3476
aesimc v0.16b,v0.16b
3477
aesd v1.16b,v17.16b
3478
aesimc v1.16b,v1.16b
3479
aesd v24.16b,v17.16b
3480
aesimc v24.16b,v24.16b
3481
aesd v25.16b,v17.16b
3482
aesimc v25.16b,v25.16b
3483
aesd v26.16b,v17.16b
3484
aesimc v26.16b,v26.16b
3485
csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo
3486
mov x7,x3
3487
3488
aesd v0.16b,v18.16b
3489
aesimc v0.16b,v0.16b
3490
aesd v1.16b,v18.16b
3491
aesimc v1.16b,v1.16b
3492
aesd v24.16b,v18.16b
3493
aesimc v24.16b,v24.16b
3494
aesd v25.16b,v18.16b
3495
aesimc v25.16b,v25.16b
3496
aesd v26.16b,v18.16b
3497
aesimc v26.16b,v26.16b
3498
add x0,x0,x6 // x0 is adjusted in such way that
3499
// at exit from the loop v1.16b-v26.16b
3500
// are loaded with last "words"
3501
add x6,x2,#0x60 // because .Lxts_dec_tail4x
3502
3503
aesd v0.16b,v19.16b
3504
aesimc v0.16b,v0.16b
3505
aesd v1.16b,v19.16b
3506
aesimc v1.16b,v1.16b
3507
aesd v24.16b,v19.16b
3508
aesimc v24.16b,v24.16b
3509
aesd v25.16b,v19.16b
3510
aesimc v25.16b,v25.16b
3511
aesd v26.16b,v19.16b
3512
aesimc v26.16b,v26.16b
3513
3514
aesd v0.16b,v20.16b
3515
aesimc v0.16b,v0.16b
3516
aesd v1.16b,v20.16b
3517
aesimc v1.16b,v1.16b
3518
aesd v24.16b,v20.16b
3519
aesimc v24.16b,v24.16b
3520
aesd v25.16b,v20.16b
3521
aesimc v25.16b,v25.16b
3522
aesd v26.16b,v20.16b
3523
aesimc v26.16b,v26.16b
3524
3525
aesd v0.16b,v21.16b
3526
aesimc v0.16b,v0.16b
3527
aesd v1.16b,v21.16b
3528
aesimc v1.16b,v1.16b
3529
aesd v24.16b,v21.16b
3530
aesimc v24.16b,v24.16b
3531
aesd v25.16b,v21.16b
3532
aesimc v25.16b,v25.16b
3533
aesd v26.16b,v21.16b
3534
aesimc v26.16b,v26.16b
3535
3536
aesd v0.16b,v22.16b
3537
aesimc v0.16b,v0.16b
3538
aesd v1.16b,v22.16b
3539
aesimc v1.16b,v1.16b
3540
aesd v24.16b,v22.16b
3541
aesimc v24.16b,v24.16b
3542
aesd v25.16b,v22.16b
3543
aesimc v25.16b,v25.16b
3544
aesd v26.16b,v22.16b
3545
aesimc v26.16b,v26.16b
3546
3547
eor v4.16b,v7.16b,v6.16b
3548
aesd v0.16b,v23.16b
3549
// The iv for first block of next iteration.
3550
extr x22,x10,x10,#32
3551
extr x10,x10,x9,#63
3552
and w11,w19,w22,asr #31
3553
eor x9,x11,x9,lsl #1
3554
fmov d6,x9
3555
fmov v6.d[1],x10
3556
eor v5.16b,v7.16b,v8.16b
3557
ld1 {v2.16b},[x0],#16
3558
aesd v1.16b,v23.16b
3559
// The iv for second block
3560
extr x22,x10,x10,#32
3561
extr x10,x10,x9,#63
3562
and w11,w19,w22,asr #31
3563
eor x9,x11,x9,lsl #1
3564
fmov d8,x9
3565
fmov v8.d[1],x10
3566
eor v17.16b,v7.16b,v9.16b
3567
ld1 {v3.16b},[x0],#16
3568
aesd v24.16b,v23.16b
3569
// The iv for third block
3570
extr x22,x10,x10,#32
3571
extr x10,x10,x9,#63
3572
and w11,w19,w22,asr #31
3573
eor x9,x11,x9,lsl #1
3574
fmov d9,x9
3575
fmov v9.d[1],x10
3576
eor v30.16b,v7.16b,v10.16b
3577
ld1 {v27.16b},[x0],#16
3578
aesd v25.16b,v23.16b
3579
// The iv for fourth block
3580
extr x22,x10,x10,#32
3581
extr x10,x10,x9,#63
3582
and w11,w19,w22,asr #31
3583
eor x9,x11,x9,lsl #1
3584
fmov d10,x9
3585
fmov v10.d[1],x10
3586
eor v31.16b,v7.16b,v11.16b
3587
ld1 {v28.16b},[x0],#16
3588
aesd v26.16b,v23.16b
3589
3590
// The iv for fifth block
3591
extr x22,x10,x10,#32
3592
extr x10,x10,x9,#63
3593
and w11,w19,w22,asr #31
3594
eor x9,x11,x9,lsl #1
3595
fmov d11,x9
3596
fmov v11.d[1],x10
3597
3598
ld1 {v29.16b},[x0],#16
3599
cbz x6,.Lxts_dec_tail4x
3600
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
3601
eor v4.16b,v4.16b,v0.16b
3602
eor v0.16b,v2.16b,v6.16b
3603
eor v5.16b,v5.16b,v1.16b
3604
eor v1.16b,v3.16b,v8.16b
3605
eor v17.16b,v17.16b,v24.16b
3606
eor v24.16b,v27.16b,v9.16b
3607
eor v30.16b,v30.16b,v25.16b
3608
eor v25.16b,v28.16b,v10.16b
3609
eor v31.16b,v31.16b,v26.16b
3610
st1 {v4.16b},[x1],#16
3611
eor v26.16b,v29.16b,v11.16b
3612
st1 {v5.16b},[x1],#16
3613
mov w6,w5
3614
st1 {v17.16b},[x1],#16
3615
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
3616
st1 {v30.16b},[x1],#16
3617
st1 {v31.16b},[x1],#16
3618
b.hs .Loop5x_xts_dec
3619
3620
cmn x2,#0x10
3621
b.ne .Loop5x_dec_after
3622
// If x2(x2) equal to -0x10, the left blocks is 4.
3623
// After specially processing, utilize the five blocks processing again.
3624
// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
3625
orr v11.16b,v10.16b,v10.16b
3626
orr v10.16b,v9.16b,v9.16b
3627
orr v9.16b,v8.16b,v8.16b
3628
orr v8.16b,v6.16b,v6.16b
3629
fmov x9,d11
3630
fmov x10,v11.d[1]
3631
eor v0.16b,v6.16b,v2.16b
3632
eor v1.16b,v8.16b,v3.16b
3633
eor v24.16b,v27.16b,v9.16b
3634
eor v25.16b,v28.16b,v10.16b
3635
eor v26.16b,v29.16b,v11.16b
3636
b.eq .Loop5x_xts_dec
3637
3638
.Loop5x_dec_after:
3639
add x2,x2,#0x50
3640
cbz x2,.Lxts_done
3641
3642
add w6,w5,#2
3643
subs x2,x2,#0x30
3644
b.lo .Lxts_inner_dec_tail
3645
3646
eor v0.16b,v6.16b,v27.16b
3647
eor v1.16b,v8.16b,v28.16b
3648
eor v24.16b,v29.16b,v9.16b
3649
b .Lxts_outer_dec_tail
3650
3651
.align 4
3652
.Lxts_dec_tail4x:
3653
add x0,x0,#16
3654
tst x21,#0xf
3655
eor v5.16b,v1.16b,v4.16b
3656
st1 {v5.16b},[x1],#16
3657
eor v17.16b,v24.16b,v17.16b
3658
st1 {v17.16b},[x1],#16
3659
eor v30.16b,v25.16b,v30.16b
3660
eor v31.16b,v26.16b,v31.16b
3661
st1 {v30.16b,v31.16b},[x1],#32
3662
3663
b.eq .Lxts_dec_abort
3664
ld1 {v0.16b},[x0],#16
3665
b .Lxts_done
3666
.align 4
3667
.Lxts_outer_dec_tail:
3668
aesd v0.16b,v16.16b
3669
aesimc v0.16b,v0.16b
3670
aesd v1.16b,v16.16b
3671
aesimc v1.16b,v1.16b
3672
aesd v24.16b,v16.16b
3673
aesimc v24.16b,v24.16b
3674
ld1 {v16.4s},[x7],#16
3675
subs w6,w6,#2
3676
aesd v0.16b,v17.16b
3677
aesimc v0.16b,v0.16b
3678
aesd v1.16b,v17.16b
3679
aesimc v1.16b,v1.16b
3680
aesd v24.16b,v17.16b
3681
aesimc v24.16b,v24.16b
3682
ld1 {v17.4s},[x7],#16
3683
b.gt .Lxts_outer_dec_tail
3684
3685
aesd v0.16b,v16.16b
3686
aesimc v0.16b,v0.16b
3687
aesd v1.16b,v16.16b
3688
aesimc v1.16b,v1.16b
3689
aesd v24.16b,v16.16b
3690
aesimc v24.16b,v24.16b
3691
eor v4.16b,v6.16b,v7.16b
3692
subs x2,x2,#0x30
3693
// The iv for first block
3694
fmov x9,d9
3695
fmov x10,v9.d[1]
3696
mov w19,#0x87
3697
extr x22,x10,x10,#32
3698
extr x10,x10,x9,#63
3699
and w11,w19,w22,asr #31
3700
eor x9,x11,x9,lsl #1
3701
fmov d6,x9
3702
fmov v6.d[1],x10
3703
eor v5.16b,v8.16b,v7.16b
3704
csel x6,x2,x6,lo // x6, w6, is zero at this point
3705
aesd v0.16b,v17.16b
3706
aesimc v0.16b,v0.16b
3707
aesd v1.16b,v17.16b
3708
aesimc v1.16b,v1.16b
3709
aesd v24.16b,v17.16b
3710
aesimc v24.16b,v24.16b
3711
eor v17.16b,v9.16b,v7.16b
3712
// The iv for second block
3713
extr x22,x10,x10,#32
3714
extr x10,x10,x9,#63
3715
and w11,w19,w22,asr #31
3716
eor x9,x11,x9,lsl #1
3717
fmov d8,x9
3718
fmov v8.d[1],x10
3719
3720
add x6,x6,#0x20
3721
add x0,x0,x6 // x0 is adjusted to the last data
3722
3723
mov x7,x3
3724
3725
// The iv for third block
3726
extr x22,x10,x10,#32
3727
extr x10,x10,x9,#63
3728
and w11,w19,w22,asr #31
3729
eor x9,x11,x9,lsl #1
3730
fmov d9,x9
3731
fmov v9.d[1],x10
3732
3733
aesd v0.16b,v20.16b
3734
aesimc v0.16b,v0.16b
3735
aesd v1.16b,v20.16b
3736
aesimc v1.16b,v1.16b
3737
aesd v24.16b,v20.16b
3738
aesimc v24.16b,v24.16b
3739
aesd v0.16b,v21.16b
3740
aesimc v0.16b,v0.16b
3741
aesd v1.16b,v21.16b
3742
aesimc v1.16b,v1.16b
3743
aesd v24.16b,v21.16b
3744
aesimc v24.16b,v24.16b
3745
aesd v0.16b,v22.16b
3746
aesimc v0.16b,v0.16b
3747
aesd v1.16b,v22.16b
3748
aesimc v1.16b,v1.16b
3749
aesd v24.16b,v22.16b
3750
aesimc v24.16b,v24.16b
3751
ld1 {v27.16b},[x0],#16
3752
aesd v0.16b,v23.16b
3753
aesd v1.16b,v23.16b
3754
aesd v24.16b,v23.16b
3755
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
3756
add w6,w5,#2
3757
eor v4.16b,v4.16b,v0.16b
3758
eor v5.16b,v5.16b,v1.16b
3759
eor v24.16b,v24.16b,v17.16b
3760
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
3761
st1 {v4.16b},[x1],#16
3762
st1 {v5.16b},[x1],#16
3763
st1 {v24.16b},[x1],#16
3764
3765
cmn x2,#0x30
3766
add x2,x2,#0x30
3767
b.eq .Lxts_done
3768
sub x2,x2,#0x30
3769
orr v28.16b,v3.16b,v3.16b
3770
orr v29.16b,v27.16b,v27.16b
3771
nop
3772
3773
.Lxts_inner_dec_tail:
3774
// x2 == -0x10 means two blocks left.
3775
cmn x2,#0x10
3776
eor v1.16b,v28.16b,v6.16b
3777
eor v24.16b,v29.16b,v8.16b
3778
b.eq .Lxts_dec_tail_loop
3779
eor v24.16b,v29.16b,v6.16b
3780
.Lxts_dec_tail_loop:
3781
aesd v1.16b,v16.16b
3782
aesimc v1.16b,v1.16b
3783
aesd v24.16b,v16.16b
3784
aesimc v24.16b,v24.16b
3785
ld1 {v16.4s},[x7],#16
3786
subs w6,w6,#2
3787
aesd v1.16b,v17.16b
3788
aesimc v1.16b,v1.16b
3789
aesd v24.16b,v17.16b
3790
aesimc v24.16b,v24.16b
3791
ld1 {v17.4s},[x7],#16
3792
b.gt .Lxts_dec_tail_loop
3793
3794
aesd v1.16b,v16.16b
3795
aesimc v1.16b,v1.16b
3796
aesd v24.16b,v16.16b
3797
aesimc v24.16b,v24.16b
3798
aesd v1.16b,v17.16b
3799
aesimc v1.16b,v1.16b
3800
aesd v24.16b,v17.16b
3801
aesimc v24.16b,v24.16b
3802
aesd v1.16b,v20.16b
3803
aesimc v1.16b,v1.16b
3804
aesd v24.16b,v20.16b
3805
aesimc v24.16b,v24.16b
3806
cmn x2,#0x20
3807
aesd v1.16b,v21.16b
3808
aesimc v1.16b,v1.16b
3809
aesd v24.16b,v21.16b
3810
aesimc v24.16b,v24.16b
3811
eor v5.16b,v6.16b,v7.16b
3812
aesd v1.16b,v22.16b
3813
aesimc v1.16b,v1.16b
3814
aesd v24.16b,v22.16b
3815
aesimc v24.16b,v24.16b
3816
eor v17.16b,v8.16b,v7.16b
3817
aesd v1.16b,v23.16b
3818
aesd v24.16b,v23.16b
3819
b.eq .Lxts_dec_one
3820
eor v5.16b,v5.16b,v1.16b
3821
eor v17.16b,v17.16b,v24.16b
3822
orr v6.16b,v9.16b,v9.16b
3823
orr v8.16b,v10.16b,v10.16b
3824
st1 {v5.16b},[x1],#16
3825
st1 {v17.16b},[x1],#16
3826
add x2,x2,#16
3827
b .Lxts_done
3828
3829
.Lxts_dec_one:
3830
eor v5.16b,v5.16b,v24.16b
3831
orr v6.16b,v8.16b,v8.16b
3832
orr v8.16b,v9.16b,v9.16b
3833
st1 {v5.16b},[x1],#16
3834
add x2,x2,#32
3835
3836
.Lxts_done:
3837
tst x21,#0xf
3838
b.eq .Lxts_dec_abort
3839
// Processing the last two blocks with cipher stealing.
3840
mov x7,x3
3841
cbnz x2,.Lxts_dec_1st_done
3842
ld1 {v0.16b},[x0],#16
3843
3844
// Decrypt the last second block to get the last plain text block
3845
.Lxts_dec_1st_done:
3846
eor v26.16b,v0.16b,v8.16b
3847
ldr w6,[x3,#240]
3848
ld1 {v0.4s},[x3],#16
3849
sub w6,w6,#2
3850
ld1 {v1.4s},[x3],#16
3851
.Loop_final_2nd_dec:
3852
aesd v26.16b,v0.16b
3853
aesimc v26.16b,v26.16b
3854
ld1 {v0.4s},[x3],#16 // load key schedule...
3855
subs w6,w6,#2
3856
aesd v26.16b,v1.16b
3857
aesimc v26.16b,v26.16b
3858
ld1 {v1.4s},[x3],#16 // load key schedule...
3859
b.gt .Loop_final_2nd_dec
3860
3861
aesd v26.16b,v0.16b
3862
aesimc v26.16b,v26.16b
3863
ld1 {v0.4s},[x3]
3864
aesd v26.16b,v1.16b
3865
eor v26.16b,v26.16b,v0.16b
3866
eor v26.16b,v26.16b,v8.16b
3867
st1 {v26.16b},[x1]
3868
3869
mov x20,x0
3870
add x13,x1,#16
3871
3872
// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3873
// to get the last encrypted block.
3874
.composite_dec_loop:
3875
subs x21,x21,#1
3876
ldrb w15,[x1,x21]
3877
ldrb w14,[x20,x21]
3878
strb w15,[x13,x21]
3879
strb w14,[x1,x21]
3880
b.gt .composite_dec_loop
3881
.Lxts_dec_load_done:
3882
ld1 {v26.16b},[x1]
3883
eor v26.16b,v26.16b,v6.16b
3884
3885
// Decrypt the composite block to get the last second plain text block
3886
ldr w6,[x7,#240]
3887
ld1 {v0.4s},[x7],#16
3888
sub w6,w6,#2
3889
ld1 {v1.4s},[x7],#16
3890
.Loop_final_dec:
3891
aesd v26.16b,v0.16b
3892
aesimc v26.16b,v26.16b
3893
ld1 {v0.4s},[x7],#16 // load key schedule...
3894
subs w6,w6,#2
3895
aesd v26.16b,v1.16b
3896
aesimc v26.16b,v26.16b
3897
ld1 {v1.4s},[x7],#16 // load key schedule...
3898
b.gt .Loop_final_dec
3899
3900
aesd v26.16b,v0.16b
3901
aesimc v26.16b,v26.16b
3902
ld1 {v0.4s},[x7]
3903
aesd v26.16b,v1.16b
3904
eor v26.16b,v26.16b,v0.16b
3905
eor v26.16b,v26.16b,v6.16b
3906
st1 {v26.16b},[x1]
3907
3908
.Lxts_dec_abort:
3909
ldp x21,x22,[sp,#48]
3910
ldp d8,d9,[sp,#32]
3911
ldp d10,d11,[sp,#16]
3912
ldp x19,x20,[sp],#64
3913
3914
.Lxts_dec_final_abort:
3915
ret
3916
.size aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
3917
#endif
3918
3919