Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
39507 views
1
/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
2
#include "arm_arch.h"
3
4
#if __ARM_MAX_ARCH__>=8
5
.arch armv8-a+crypto
6
.text
7
// aes_gcm_enc_128_kernel: AES-128 counter-mode encryption interleaved with GHASH.
// Register roles on entry, as used by the code below:
//   x0 - input (plaintext) pointer
//   x1 - input length in bits (byte length is x1 >> 3)
//   x2 - output pointer
//   x3 - pointer to a block with the running tag at offset 0 and the hash key
//        values h1 at offset 32, h2 at 64, h3 at 80 and h4 at 112
//   x4 - pointer to the 16-byte counter block (copied to x16)
//   x5 - pointer to the round keys rk0..rk10, 16 bytes each (copied to x8)
// Result: x0 = byte length processed, or 0 when x1 is zero.
// x19-x24 and d8-d15 are saved in a 112-byte stack frame and restored on exit.
// This first part sets up the counter blocks in v0-v3, runs AES rounds 0-9 on
// them while loading keys, and (if more than 64 bytes are present) encrypts
// and stores the first four blocks before entering the main loop.
.globl aes_gcm_enc_128_kernel
8
.type aes_gcm_enc_128_kernel,%function
9
.align 4
10
aes_gcm_enc_128_kernel:
11
AARCH64_VALID_CALL_TARGET
12
cbz x1, .L128_enc_ret //zero bit length: return 0 before touching the stack
13
stp x19, x20, [sp, #-112]! //allocate the 112-byte frame and save x19/x20
14
mov x16, x4 //x16 = counter block pointer
15
mov x8, x5 //x8 = round key pointer
16
stp x21, x22, [sp, #16]
17
stp x23, x24, [sp, #32]
18
stp d8, d9, [sp, #48]
19
stp d10, d11, [sp, #64]
20
stp d12, d13, [sp, #80]
21
stp d14, d15, [sp, #96]
22
23
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
24
#ifdef __AARCH64EB__
25
rev x10, x10
26
rev x11, x11
27
#endif
28
ldp x13, x14, [x8, #160] //load rk10
29
#ifdef __AARCH64EB__
30
ror x13, x13, #32
31
ror x14, x14, #32
32
#endif
33
ld1 {v11.16b}, [x3] //load the running tag
34
ext v11.16b, v11.16b, v11.16b, #8
35
rev64 v11.16b, v11.16b
36
lsr x5, x1, #3 //byte_len
37
mov x15, x5 //keep byte_len for the return value
38
39
ld1 {v18.4s}, [x8], #16 //load rk0
40
add x4, x0, x1, lsr #3 //end_input_ptr
41
sub x5, x5, #1 //byte_len - 1
42
43
lsr x12, x11, #32 //x12 = upper 32 bits of x11, the 32-bit counter word
44
ldr q15, [x3, #112] //load h4l | h4h
45
#ifndef __AARCH64EB__
46
ext v15.16b, v15.16b, v15.16b, #8
47
#endif
48
fmov d1, x10 //CTR block 1
49
rev w12, w12 //rev_ctr32
50
51
add w12, w12, #1 //increment rev_ctr32
52
orr w11, w11, w11 //writing w11 clears bits 63:32 of x11, leaving only its low word
53
ld1 {v19.4s}, [x8], #16 //load rk1
54
55
rev w9, w12 //CTR block 1
56
add w12, w12, #1 //CTR block 1
57
fmov d3, x10 //CTR block 3
58
59
orr x9, x11, x9, lsl #32 //CTR block 1
60
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
61
62
fmov v1.d[1], x9 //CTR block 1
63
rev w9, w12 //CTR block 2
64
65
fmov d2, x10 //CTR block 2
66
orr x9, x11, x9, lsl #32 //CTR block 2
67
add w12, w12, #1 //CTR block 2
68
69
fmov v2.d[1], x9 //CTR block 2
70
rev w9, w12 //CTR block 3
71
72
orr x9, x11, x9, lsl #32 //CTR block 3
73
ld1 {v20.4s}, [x8], #16 //load rk2
74
75
add w12, w12, #1 //CTR block 3
76
fmov v3.d[1], x9 //CTR block 3
77
78
ldr q14, [x3, #80] //load h3l | h3h
79
#ifndef __AARCH64EB__
80
ext v14.16b, v14.16b, v14.16b, #8
81
#endif
82
aese v1.16b, v18.16b
83
aesmc v1.16b, v1.16b //AES block 1 - round 0
84
ld1 {v21.4s}, [x8], #16 //load rk3
85
86
aese v2.16b, v18.16b
87
aesmc v2.16b, v2.16b //AES block 2 - round 0
88
ldr q12, [x3, #32] //load h1l | h1h
89
#ifndef __AARCH64EB__
90
ext v12.16b, v12.16b, v12.16b, #8
91
#endif
92
93
aese v0.16b, v18.16b
94
aesmc v0.16b, v0.16b //AES block 0 - round 0
95
ld1 {v22.4s}, [x8], #16 //load rk4
96
97
aese v3.16b, v18.16b
98
aesmc v3.16b, v3.16b //AES block 3 - round 0
99
ld1 {v23.4s}, [x8], #16 //load rk5
100
101
aese v2.16b, v19.16b
102
aesmc v2.16b, v2.16b //AES block 2 - round 1
103
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
104
105
aese v0.16b, v19.16b
106
aesmc v0.16b, v0.16b //AES block 0 - round 1
107
ld1 {v24.4s}, [x8], #16 //load rk6
108
109
aese v1.16b, v19.16b
110
aesmc v1.16b, v1.16b //AES block 1 - round 1
111
ld1 {v25.4s}, [x8], #16 //load rk7
112
113
aese v3.16b, v19.16b
114
aesmc v3.16b, v3.16b //AES block 3 - round 1
115
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
116
117
aese v0.16b, v20.16b
118
aesmc v0.16b, v0.16b //AES block 0 - round 2
119
ld1 {v26.4s}, [x8], #16 //load rk8
120
121
aese v1.16b, v20.16b
122
aesmc v1.16b, v1.16b //AES block 1 - round 2
123
ldr q13, [x3, #64] //load h2l | h2h
124
#ifndef __AARCH64EB__
125
ext v13.16b, v13.16b, v13.16b, #8
126
#endif
127
128
aese v3.16b, v20.16b
129
aesmc v3.16b, v3.16b //AES block 3 - round 2
130
131
aese v2.16b, v20.16b
132
aesmc v2.16b, v2.16b //AES block 2 - round 2
133
eor v17.16b, v17.16b, v9.16b //h4k | h3k
134
135
aese v0.16b, v21.16b
136
aesmc v0.16b, v0.16b //AES block 0 - round 3
137
138
aese v1.16b, v21.16b
139
aesmc v1.16b, v1.16b //AES block 1 - round 3
140
141
aese v2.16b, v21.16b
142
aesmc v2.16b, v2.16b //AES block 2 - round 3
143
ld1 {v27.4s}, [x8], #16 //load rk9
144
145
aese v3.16b, v21.16b
146
aesmc v3.16b, v3.16b //AES block 3 - round 3
147
148
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
149
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
150
151
aese v3.16b, v22.16b
152
aesmc v3.16b, v3.16b //AES block 3 - round 4
153
add x5, x5, x0 //x5 = input pointer at which the main loop must stop
154
155
aese v2.16b, v22.16b
156
aesmc v2.16b, v2.16b //AES block 2 - round 4
157
cmp x0, x5 //check if we have <= 4 blocks
158
159
aese v0.16b, v22.16b
160
aesmc v0.16b, v0.16b //AES block 0 - round 4
161
162
aese v3.16b, v23.16b
163
aesmc v3.16b, v3.16b //AES block 3 - round 5
164
165
aese v2.16b, v23.16b
166
aesmc v2.16b, v2.16b //AES block 2 - round 5
167
168
aese v0.16b, v23.16b
169
aesmc v0.16b, v0.16b //AES block 0 - round 5
170
171
aese v3.16b, v24.16b
172
aesmc v3.16b, v3.16b //AES block 3 - round 6
173
174
aese v1.16b, v22.16b
175
aesmc v1.16b, v1.16b //AES block 1 - round 4
176
177
aese v2.16b, v24.16b
178
aesmc v2.16b, v2.16b //AES block 2 - round 6
179
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
180
181
aese v0.16b, v24.16b
182
aesmc v0.16b, v0.16b //AES block 0 - round 6
183
184
aese v1.16b, v23.16b
185
aesmc v1.16b, v1.16b //AES block 1 - round 5
186
187
aese v3.16b, v25.16b
188
aesmc v3.16b, v3.16b //AES block 3 - round 7
189
190
aese v0.16b, v25.16b
191
aesmc v0.16b, v0.16b //AES block 0 - round 7
192
193
aese v1.16b, v24.16b
194
aesmc v1.16b, v1.16b //AES block 1 - round 6
195
196
aese v2.16b, v25.16b
197
aesmc v2.16b, v2.16b //AES block 2 - round 7
198
199
aese v0.16b, v26.16b
200
aesmc v0.16b, v0.16b //AES block 0 - round 8
201
202
aese v1.16b, v25.16b
203
aesmc v1.16b, v1.16b //AES block 1 - round 7
204
205
aese v2.16b, v26.16b
206
aesmc v2.16b, v2.16b //AES block 2 - round 8
207
208
aese v3.16b, v26.16b
209
aesmc v3.16b, v3.16b //AES block 3 - round 8
210
211
aese v1.16b, v26.16b
212
aesmc v1.16b, v1.16b //AES block 1 - round 8
213
214
aese v2.16b, v27.16b //AES block 2 - round 9
215
216
aese v0.16b, v27.16b //AES block 0 - round 9
217
218
eor v16.16b, v16.16b, v8.16b //h2k | h1k
219
220
aese v1.16b, v27.16b //AES block 1 - round 9
221
222
aese v3.16b, v27.16b //AES block 3 - round 9
223
// The branch below consumes the flags set by the cmp x0, x5 above; none of
// the AES or vector instructions in between writes the condition flags.
b.ge .L128_enc_tail //handle tail
224
225
// Round 10 is applied in general registers: each plaintext half is XORed with
// rk10 (x13 low, x14 high) and the result is then XORed with the round 9 state.
ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
226
#ifdef __AARCH64EB__
227
rev x6, x6
228
rev x7, x7
229
#endif
230
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
231
#ifdef __AARCH64EB__
232
rev x21, x21
233
rev x22, x22
234
#endif
235
ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
236
#ifdef __AARCH64EB__
237
rev x19, x19
238
rev x20, x20
239
#endif
240
ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
241
#ifdef __AARCH64EB__
242
rev x23, x23
243
rev x24, x24
244
#endif
245
eor x6, x6, x13 //AES block 0 - round 10 low
246
eor x7, x7, x14 //AES block 0 - round 10 high
247
248
eor x21, x21, x13 //AES block 2 - round 10 low
249
fmov d4, x6 //AES block 0 - mov low
250
251
eor x19, x19, x13 //AES block 1 - round 10 low
252
eor x22, x22, x14 //AES block 2 - round 10 high
253
fmov v4.d[1], x7 //AES block 0 - mov high
254
255
fmov d5, x19 //AES block 1 - mov low
256
eor x20, x20, x14 //AES block 1 - round 10 high
257
258
eor x23, x23, x13 //AES block 3 - round 10 low
259
fmov v5.d[1], x20 //AES block 1 - mov high
260
261
fmov d6, x21 //AES block 2 - mov low
262
eor x24, x24, x14 //AES block 3 - round 10 high
263
rev w9, w12 //CTR block 4
264
265
fmov v6.d[1], x22 //AES block 2 - mov high
266
orr x9, x11, x9, lsl #32 //CTR block 4
267
268
eor v4.16b, v4.16b, v0.16b //AES block 0 - result
269
fmov d0, x10 //CTR block 4
270
add w12, w12, #1 //CTR block 4
271
272
fmov v0.d[1], x9 //CTR block 4
273
rev w9, w12 //CTR block 5
274
275
eor v5.16b, v5.16b, v1.16b //AES block 1 - result
276
fmov d1, x10 //CTR block 5
277
orr x9, x11, x9, lsl #32 //CTR block 5
278
279
add w12, w12, #1 //CTR block 5
280
add x0, x0, #64 //AES input_ptr update
281
fmov v1.d[1], x9 //CTR block 5
282
283
fmov d7, x23 //AES block 3 - mov low
284
rev w9, w12 //CTR block 6
285
st1 { v4.16b}, [x2], #16 //AES block 0 - store result
286
287
fmov v7.d[1], x24 //AES block 3 - mov high
288
orr x9, x11, x9, lsl #32 //CTR block 6
289
290
add w12, w12, #1 //CTR block 6
291
eor v6.16b, v6.16b, v2.16b //AES block 2 - result
292
st1 { v5.16b}, [x2], #16 //AES block 1 - store result
293
294
fmov d2, x10 //CTR block 6
295
cmp x0, x5 //check if we have <= 8 blocks
296
297
fmov v2.d[1], x9 //CTR block 6
298
rev w9, w12 //CTR block 7
299
st1 { v6.16b}, [x2], #16 //AES block 2 - store result
300
301
orr x9, x11, x9, lsl #32 //CTR block 7
302
303
eor v7.16b, v7.16b, v3.16b //AES block 3 - result
304
st1 { v7.16b}, [x2], #16 //AES block 3 - store result
305
b.ge .L128_enc_prepretail //do prepretail
306
307
// Main loop: one iteration per 64 bytes of input.
// On entry v4-v7 hold the four ciphertext blocks produced previously and
// v0-v2 plus x9 hold the next counter blocks. Each iteration
//   - folds v4-v7 into the GHASH accumulators (v11 low, v9 high, v10 mid)
//     using the hash key values in v12-v17 and reduces with the constant in v8,
//   - runs AES rounds 0-9 on the counter blocks v0-v3,
//   - loads 64 bytes from x0, applies round 10 via rk10 in x13/x14, XORs with
//     the AES state and stores 64 bytes through x2,
//   - prepares the counter blocks for the next iteration from w12.
// The loop repeats while x0 is below the main loop end pointer in x5.
.L128_enc_main_loop: //main loop start
308
ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
309
#ifdef __AARCH64EB__
310
rev x23, x23
311
rev x24, x24
312
#endif
313
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
314
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
315
316
aese v2.16b, v18.16b
317
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
318
fmov d3, x10 //CTR block 4k+3
319
320
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
321
rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
322
323
aese v1.16b, v18.16b
324
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
325
add w12, w12, #1 //CTR block 4k+3
326
fmov v3.d[1], x9 //CTR block 4k+3
327
328
aese v0.16b, v18.16b
329
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
330
mov d31, v6.d[1] //GHASH block 4k+2 - mid
331
332
aese v2.16b, v19.16b
333
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
334
mov d30, v5.d[1] //GHASH block 4k+1 - mid
335
336
aese v1.16b, v19.16b
337
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
338
eor v4.16b, v4.16b, v11.16b //PRE 1
339
340
aese v3.16b, v18.16b
341
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
342
eor x24, x24, x14 //AES block 4k+3 - round 10 high
343
344
pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
345
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
346
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
347
#ifdef __AARCH64EB__
348
rev x6, x6
349
rev x7, x7
350
#endif
351
aese v0.16b, v19.16b
352
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
353
rev w9, w12 //CTR block 4k+8
354
355
eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
356
mov d8, v4.d[1] //GHASH block 4k - mid
357
orr x9, x11, x9, lsl #32 //CTR block 4k+8
358
359
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
360
add w12, w12, #1 //CTR block 4k+8
361
mov d10, v17.d[1] //GHASH block 4k - mid
362
363
aese v0.16b, v20.16b
364
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
365
366
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
367
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
368
369
aese v1.16b, v20.16b
370
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
371
372
aese v0.16b, v21.16b
373
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
374
eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
375
376
pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
377
378
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
379
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
380
381
pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
382
383
pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
384
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
385
386
pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
387
eor x7, x7, x14 //AES block 4k+4 - round 10 high
388
389
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
390
mov d30, v7.d[1] //GHASH block 4k+3 - mid
391
392
aese v3.16b, v19.16b
393
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
394
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
395
396
aese v2.16b, v20.16b
397
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
398
eor x6, x6, x13 //AES block 4k+4 - round 10 low
399
400
aese v1.16b, v21.16b
401
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
402
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
403
404
pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
405
406
aese v2.16b, v21.16b
407
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
408
eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
409
410
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
411
412
pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
413
movi v8.8b, #0xc2
414
415
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
416
eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
417
418
aese v1.16b, v22.16b
419
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
420
421
aese v3.16b, v20.16b
422
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
423
shl d8, d8, #56 //mod_constant
424
425
aese v0.16b, v22.16b
426
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
427
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
428
429
aese v1.16b, v23.16b
430
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
431
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
432
#ifdef __AARCH64EB__
433
rev x19, x19
434
rev x20, x20
435
#endif
436
aese v3.16b, v21.16b
437
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
438
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
439
440
aese v0.16b, v23.16b
441
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
442
ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
443
#ifdef __AARCH64EB__
444
rev x21, x21
445
rev x22, x22
446
#endif
447
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
448
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
449
450
aese v2.16b, v22.16b
451
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
452
eor x19, x19, x13 //AES block 4k+5 - round 10 low
453
454
aese v3.16b, v22.16b
455
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
456
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
457
458
aese v1.16b, v24.16b
459
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
460
eor x23, x23, x13 //AES block 4k+3 - round 10 low
461
462
aese v2.16b, v23.16b
463
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
464
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
465
466
fmov d4, x6 //AES block 4k+4 - mov low
467
aese v0.16b, v24.16b
468
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
469
fmov v4.d[1], x7 //AES block 4k+4 - mov high
470
471
add x0, x0, #64 //AES input_ptr update
472
fmov d7, x23 //AES block 4k+3 - mov low
473
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
474
475
aese v3.16b, v23.16b
476
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
477
fmov d5, x19 //AES block 4k+5 - mov low
478
479
aese v0.16b, v25.16b
480
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
481
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
482
483
aese v2.16b, v24.16b
484
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
485
eor x20, x20, x14 //AES block 4k+5 - round 10 high
486
487
aese v1.16b, v25.16b
488
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
489
fmov v5.d[1], x20 //AES block 4k+5 - mov high
490
491
aese v0.16b, v26.16b
492
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
493
fmov v7.d[1], x24 //AES block 4k+3 - mov high
494
495
aese v3.16b, v24.16b
496
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
497
cmp x0, x5 //.LOOP CONTROL
498
499
aese v1.16b, v26.16b
500
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
501
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
502
503
aese v0.16b, v27.16b //AES block 4k+4 - round 9
504
eor x21, x21, x13 //AES block 4k+6 - round 10 low
505
eor x22, x22, x14 //AES block 4k+6 - round 10 high
506
507
aese v3.16b, v25.16b
508
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
509
fmov d6, x21 //AES block 4k+6 - mov low
510
511
aese v1.16b, v27.16b //AES block 4k+5 - round 9
512
fmov v6.d[1], x22 //AES block 4k+6 - mov high
513
514
aese v2.16b, v25.16b
515
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
516
eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
517
518
fmov d0, x10 //CTR block 4k+8
519
aese v3.16b, v26.16b
520
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
521
522
fmov v0.d[1], x9 //CTR block 4k+8
523
rev w9, w12 //CTR block 4k+9
524
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
525
526
aese v2.16b, v26.16b
527
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
528
eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
529
530
add w12, w12, #1 //CTR block 4k+9
531
orr x9, x11, x9, lsl #32 //CTR block 4k+9
532
fmov d1, x10 //CTR block 4k+9
533
534
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
535
fmov v1.d[1], x9 //CTR block 4k+9
536
rev w9, w12 //CTR block 4k+10
537
538
aese v2.16b, v27.16b //AES block 4k+6 - round 9
539
st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result
540
eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
541
orr x9, x11, x9, lsl #32 //CTR block 4k+10
542
543
aese v3.16b, v27.16b //AES block 4k+7 - round 9
544
add w12, w12, #1 //CTR block 4k+10
545
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
546
fmov d2, x10 //CTR block 4k+10
547
548
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
549
st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
550
551
fmov v2.d[1], x9 //CTR block 4k+10
552
st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
553
rev w9, w12 //CTR block 4k+11
554
555
orr x9, x11, x9, lsl #32 //CTR block 4k+11
556
eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result
557
558
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
559
st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result
560
// Loop while x0 < x5; the flags are those of the cmp x0, x5 marked
// .LOOP CONTROL above, and nothing after it writes the condition flags.
b.lt .L128_enc_main_loop
561
562
// Prepretail: reached once no further full 64-byte iteration remains.
// It performs no loads or stores. It folds the last four stored ciphertext
// blocks (v4-v7) into the GHASH accumulators, reduces the result into v11,
// and runs AES rounds 0-9 on the counter blocks v0-v3 so that the tail can
// use them as keystream. Execution falls through into the tail.
.L128_enc_prepretail: //PREPRETAIL
563
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
564
fmov d3, x10 //CTR block 4k+3
565
rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
566
567
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
568
add w12, w12, #1 //CTR block 4k+3
569
fmov v3.d[1], x9 //CTR block 4k+3
570
571
aese v1.16b, v18.16b
572
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
573
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
574
575
pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
576
577
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
578
eor v4.16b, v4.16b, v11.16b //PRE 1
579
580
pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
581
582
aese v3.16b, v18.16b
583
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
584
mov d30, v5.d[1] //GHASH block 4k+1 - mid
585
586
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
587
mov d8, v4.d[1] //GHASH block 4k - mid
588
589
mov d31, v6.d[1] //GHASH block 4k+2 - mid
590
mov d10, v17.d[1] //GHASH block 4k - mid
591
592
aese v1.16b, v19.16b
593
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
594
eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
595
596
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
597
598
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
599
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
600
601
aese v3.16b, v19.16b
602
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
603
604
pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
605
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
606
607
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
608
609
aese v0.16b, v18.16b
610
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
611
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
612
613
aese v2.16b, v18.16b
614
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
615
616
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
617
mov d30, v7.d[1] //GHASH block 4k+3 - mid
618
619
aese v0.16b, v19.16b
620
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
621
eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
622
623
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
624
625
pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
626
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
627
628
pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
629
630
pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
631
632
aese v2.16b, v19.16b
633
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
634
eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
635
636
aese v0.16b, v20.16b
637
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
638
639
pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
640
movi v8.8b, #0xc2
641
642
aese v2.16b, v20.16b
643
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
644
eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
645
646
aese v3.16b, v20.16b
647
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
648
649
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
650
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
651
652
aese v2.16b, v21.16b
653
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
654
655
aese v1.16b, v20.16b
656
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
657
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
658
659
aese v0.16b, v21.16b
660
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
661
662
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
663
shl d8, d8, #56 //mod_constant
664
665
aese v1.16b, v21.16b
666
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
667
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
668
669
aese v0.16b, v22.16b
670
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
671
672
pmull v28.1q, v9.1d, v8.1d //MODULO - high times mod_constant, to be folded into mid
673
eor v10.16b, v10.16b, v9.16b //karatsuba tidy up
674
675
aese v1.16b, v22.16b
676
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
677
678
aese v0.16b, v23.16b
679
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
680
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - swap the halves of high
681
682
aese v3.16b, v21.16b
683
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
684
685
aese v2.16b, v22.16b
686
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
687
eor v10.16b, v10.16b, v11.16b //karatsuba tidy up - mid now also includes low
688
689
aese v0.16b, v24.16b
690
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
691
692
aese v3.16b, v22.16b
693
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
694
695
aese v1.16b, v23.16b
696
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
697
698
aese v2.16b, v23.16b
699
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
700
eor v10.16b, v10.16b, v28.16b //MODULO - fold into mid
701
702
aese v3.16b, v23.16b
703
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
704
705
aese v1.16b, v24.16b
706
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
707
708
aese v2.16b, v24.16b
709
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
710
711
aese v3.16b, v24.16b
712
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
713
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
714
715
aese v0.16b, v25.16b
716
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
717
718
aese v2.16b, v25.16b
719
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
720
721
aese v3.16b, v25.16b
722
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
723
724
pmull v28.1q, v10.1d, v8.1d //MODULO - mid times mod_constant, to be folded into low
725
726
aese v1.16b, v25.16b
727
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
728
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - swap the halves of mid
729
730
aese v3.16b, v26.16b
731
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
732
733
aese v0.16b, v26.16b
734
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
735
eor v11.16b, v11.16b, v28.16b //MODULO - fold into low
736
737
aese v1.16b, v26.16b
738
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
739
740
aese v3.16b, v27.16b //AES block 4k+7 - round 9
741
742
aese v2.16b, v26.16b
743
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
744
745
aese v0.16b, v27.16b //AES block 4k+4 - round 9
746
747
aese v1.16b, v27.16b //AES block 4k+5 - round 9
748
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
749
750
aese v2.16b, v27.16b //AES block 4k+6 - round 9
751
// Tail entry: x4 - x0 bytes remain (at least 1, since the main loop end was
// derived from byte_len - 1). v0-v3 hold the round 9 AES state for the next
// four counter blocks and v11 holds the reduced GHASH value so far.
// The first remaining block is always encrypted here into v5. The code then
// dispatches on the remaining byte count: more than 48, 32 or 16 bytes jump
// to the 4-, 3- and 2-block paths; otherwise only one (possibly partial)
// block is left. On the shorter paths the keystream registers are shifted
// down so the later code can always use v2 and v3, w12 is decremented once
// per counter block that will not be consumed, and the GHASH accumulators
// v11 (low), v9 (high) and v10 (mid) are cleared.
.L128_enc_tail: //TAIL
752
753
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
754
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
755
#ifdef __AARCH64EB__
756
rev x6, x6
757
rev x7, x7
758
#endif
759
cmp x5, #48
760
761
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
762
eor x6, x6, x13 //AES block 4k+4 - round 10 low
763
eor x7, x7, x14 //AES block 4k+4 - round 10 high
764
765
fmov d4, x6 //AES block 4k+4 - mov low
766
767
fmov v4.d[1], x7 //AES block 4k+4 - mov high
768
769
eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result
770
771
b.gt .L128_enc_blocks_more_than_3 //more than 48 bytes left: four blocks
772
773
sub w12, w12, #1 //at most three blocks: one counter value goes unused
774
movi v11.8b, #0 //clear GHASH low accumulator
775
mov v3.16b, v2.16b //shift keystream down: v3 = third block keystream
776
777
cmp x5, #32
778
mov v2.16b, v1.16b //shift keystream down: v2 = second block keystream
779
movi v9.8b, #0 //clear GHASH high accumulator
780
781
movi v10.8b, #0 //clear GHASH mid accumulator
782
b.gt .L128_enc_blocks_more_than_2 //more than 32 bytes left: three blocks
783
784
mov v3.16b, v1.16b //two blocks or fewer: v3 = second block keystream
785
cmp x5, #16
786
787
sub w12, w12, #1 //a second counter value goes unused
788
b.gt .L128_enc_blocks_more_than_1 //more than 16 bytes left: two blocks
789
790
sub w12, w12, #1 //single block: a third counter value goes unused
791
b .L128_enc_blocks_less_than_1
792
// Four blocks remain. Stores the ciphertext already computed in v5, starts
// the GHASH accumulators (v11 low, v9 high, v10 mid) from that block combined
// with the partial tag in v8 and the h4 value in v15/v17, and encrypts the
// next block with the keystream in v1. v8 is then cleared so the tag is only
// fed in once. Falls through to the three-block path.
.L128_enc_blocks_more_than_3: //blocks left > 3
793
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
794
795
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
796
#ifdef __AARCH64EB__
797
rev x6, x6
798
rev x7, x7
799
#endif
800
rev64 v4.16b, v5.16b //GHASH final-3 block
801
802
eor v4.16b, v4.16b, v8.16b //feed in partial tag
803
eor x7, x7, x14 //AES final-2 block - round 10 high
804
eor x6, x6, x13 //AES final-2 block - round 10 low
805
806
fmov d5, x6 //AES final-2 block - mov low
807
808
movi v8.8b, #0 //suppress further partial tag feed in
809
fmov v5.d[1], x7 //AES final-2 block - mov high
810
811
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
812
mov d22, v4.d[1] //GHASH final-3 block - mid
813
814
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
815
816
mov d10, v17.d[1] //GHASH final-3 block - mid
817
818
eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
819
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
820
821
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
822
// Three blocks remain (or fall-through from the four-block path). Stores the
// ciphertext in v5, accumulates it into GHASH using the h3 value in v14/v17
// (with the partial tag in v8, which is zero if it was already consumed), and
// encrypts the next block with the keystream in v2. Uses v20-v22 as scratch;
// they are not read as round keys after this point. Falls through to the
// two-block path.
.L128_enc_blocks_more_than_2: //blocks left > 2
823
824
st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
825
826
rev64 v4.16b, v5.16b //GHASH final-2 block
827
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
828
#ifdef __AARCH64EB__
829
rev x6, x6
830
rev x7, x7
831
#endif
832
eor v4.16b, v4.16b, v8.16b //feed in partial tag
833
834
eor x6, x6, x13 //AES final-1 block - round 10 low
835
836
fmov d5, x6 //AES final-1 block - mov low
837
eor x7, x7, x14 //AES final-1 block - round 10 high
838
839
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
840
fmov v5.d[1], x7 //AES final-1 block - mov high
841
842
mov d22, v4.d[1] //GHASH final-2 block - mid
843
844
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
845
846
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
847
848
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
849
850
eor v5.16b, v5.16b, v2.16b //AES final-1 block - result
851
852
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
853
854
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
855
856
movi v8.8b, #0 //suppress further partial tag feed in
857
858
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
859
// Two blocks remain (or fall-through from the three-block path). Stores the
// ciphertext in v5, accumulates it into GHASH using the h2 value in v13/v16
// (with the partial tag in v8, zero if already consumed), and encrypts the
// final block with the keystream in v3. That final result stays in v5 and is
// masked and stored by the code that follows.
.L128_enc_blocks_more_than_1: //blocks left > 1
860
861
st1 { v5.16b}, [x2], #16 //AES final-1 block - store result
862
863
rev64 v4.16b, v5.16b //GHASH final-1 block
864
ldp x6, x7, [x0], #16 //AES final block - load input low & high
865
#ifdef __AARCH64EB__
866
rev x6, x6
867
rev x7, x7
868
#endif
869
eor v4.16b, v4.16b, v8.16b //feed in partial tag
870
871
eor x7, x7, x14 //AES final block - round 10 high
872
eor x6, x6, x13 //AES final block - round 10 low
873
874
fmov d5, x6 //AES final block - mov low
875
876
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
877
fmov v5.d[1], x7 //AES final block - mov high
878
879
mov d22, v4.d[1] //GHASH final-1 block - mid
880
881
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
882
883
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
884
885
eor v5.16b, v5.16b, v3.16b //AES final block - result
886
887
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
888
889
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
890
891
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
892
893
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
894
895
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
896
movi v8.8b, #0 //suppress further partial tag feed in
897
// Final block handling and function exit.
// v5 holds the ciphertext of the last block, which may be partial. A byte
// mask is built in v0 from the bit length in x1 (x13/x14 are reused as
// all-ones scratch; rk10 is no longer needed). The masked block is folded
// into GHASH with the h1 value in v12/v16, the accumulators are reduced into
// v11, and the bytes of the output beyond the message are preserved by
// merging the existing memory contents (v18) back in before the 16-byte store.
// Finally the updated counter word and the tag are written back, the saved
// registers are restored and byte_len (x15) is returned in x0.
.L128_enc_blocks_less_than_1: //blocks left <= 1
898
899
and x1, x1, #127 //bit_length %= 128
900
mvn x13, xzr //rk10_l = 0xffffffffffffffff
901
902
mvn x14, xzr //rk10_h = 0xffffffffffffffff
903
sub x1, x1, #128 //bit_length -= 128
904
905
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
906
907
and x1, x1, #127 //bit_length %= 128
908
909
// x1 is now the number of bits to drop from the last block (0 for a full
// block). A register-specified shift uses the amount modulo 64.
lsr x14, x14, x1 //rk10_h is mask for top 64b of last block
910
cmp x1, #64
911
912
csel x6, x13, x14, lt //low half mask: all ones if fewer than 64 bits are dropped, else the shifted mask
913
csel x7, x14, xzr, lt //high half mask: the shifted mask if fewer than 64 bits are dropped, else zero
914
915
fmov d0, x6 //ctr0b is mask for last block
916
917
fmov v0.d[1], x7
918
919
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
920
921
rev64 v4.16b, v5.16b //GHASH final block
922
923
eor v4.16b, v4.16b, v8.16b //feed in partial tag
924
925
mov d8, v4.d[1] //GHASH final block - mid
926
927
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
928
ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
929
930
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
931
#ifndef __AARCH64EB__
932
rev w9, w12 //byte-reverse the counter word for storing (little-endian build)
933
#else
934
mov w9, w12 //big-endian build: store the counter word as is
935
#endif
936
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
937
938
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
939
940
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
941
942
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
943
944
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
945
movi v8.8b, #0xc2
946
947
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
948
949
shl d8, d8, #56 //mod_constant
950
951
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
952
953
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
954
955
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
956
957
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
958
959
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
960
961
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
962
963
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
964
965
bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing
966
967
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
968
st1 { v5.16b}, [x2] //store all 16B
969
970
str w9, [x16, #12] //store the updated counter
971
972
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
973
// Same ext/rev64 pair that was applied to the tag when it was loaded; applying
// it again restores the byte order in which the tag is kept in memory.
ext v11.16b, v11.16b, v11.16b, #8
974
rev64 v11.16b, v11.16b
975
mov x0, x15 //return value: byte_len saved at entry
976
st1 { v11.16b }, [x3] //write the updated tag back
977
ldp x21, x22, [sp, #16]
978
ldp x23, x24, [sp, #32]
979
ldp d8, d9, [sp, #48]
980
ldp d10, d11, [sp, #64]
981
ldp d12, d13, [sp, #80]
982
ldp d14, d15, [sp, #96]
983
ldp x19, x20, [sp], #112 //restore x19/x20 and release the 112-byte frame
984
ret
985
986
// Early exit for aes_gcm_enc_128_kernel, taken by the cbz at function entry
// when the bit length in x1 is zero. No stack frame has been set up at that
// point, so this returns 0 directly.
.L128_enc_ret:
987
mov w0, #0x0
988
ret
989
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
990
// aes_gcm_dec_128_kernel: AES-GCM decryption kernel using 10 AES rounds
// (round keys rk0..rk10) together with PMULL-based GHASH.
// Register roles as used by the code in this function:
//   x0  input pointer (ciphertext is loaded from [x0])
//   x1  length in bits (byte_len = x1 >> 3); x1 == 0 returns 0 immediately
//   x2  output pointer (decrypted blocks are stored to [x2])
//   x3  GHASH state: running hash at [x3], hash-key values at
//       [x3,#32], [x3,#64], [x3,#80] and [x3,#112]
//   x4  counter block pointer (copied to x16)
//   x5  round-key pointer (copied to x8)
// On the normal path x0 returns byte_len (kept in x15).
// x19-x24 and d8-d15 are saved in a 112-byte stack frame and restored on exit.
.globl aes_gcm_dec_128_kernel
991
.type aes_gcm_dec_128_kernel,%function
992
.align 4
993
aes_gcm_dec_128_kernel:
994
AARCH64_VALID_CALL_TARGET
995
cbz x1, .L128_dec_ret
996
stp x19, x20, [sp, #-112]!
997
mov x16, x4
998
mov x8, x5
999
stp x21, x22, [sp, #16]
1000
stp x23, x24, [sp, #32]
1001
stp d8, d9, [sp, #48]
1002
stp d10, d11, [sp, #64]
1003
stp d12, d13, [sp, #80]
1004
stp d14, d15, [sp, #96]
1005
1006
lsr x5, x1, #3 //byte_len
1007
mov x15, x5
1008
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
1009
#ifdef __AARCH64EB__
1010
rev x10, x10
1011
rev x11, x11
1012
#endif
1013
// The last round key is kept in general registers x13:x14; it is XORed in
// after the block has been moved out of the vector registers.
ldp x13, x14, [x8, #160] //load rk10
1014
#ifdef __AARCH64EB__
1015
ror x14, x14, 32
1016
ror x13, x13, 32
1017
#endif
1018
sub x5, x5, #1 //byte_len - 1
1019
ld1 {v18.4s}, [x8], #16 //load rk0
1020
1021
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1022
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
1023
1024
ldr q13, [x3, #64] //load h2l | h2h
1025
#ifndef __AARCH64EB__
1026
ext v13.16b, v13.16b, v13.16b, #8
1027
#endif
1028
// x12 = upper 32 bits of x11, byte-reversed below so it can be incremented
// with a plain add; the write to w11 clears the upper half of x11.
lsr x12, x11, #32
1029
fmov d2, x10 //CTR block 2
1030
1031
ld1 {v19.4s}, [x8], #16 //load rk1
1032
orr w11, w11, w11
1033
rev w12, w12 //rev_ctr32
1034
1035
fmov d1, x10 //CTR block 1
1036
add w12, w12, #1 //increment rev_ctr32
1037
1038
aese v0.16b, v18.16b
1039
aesmc v0.16b, v0.16b //AES block 0 - round 0
1040
rev w9, w12 //CTR block 1
1041
1042
orr x9, x11, x9, lsl #32 //CTR block 1
1043
ld1 {v20.4s}, [x8], #16 //load rk2
1044
add w12, w12, #1 //CTR block 1
1045
1046
fmov v1.d[1], x9 //CTR block 1
1047
rev w9, w12 //CTR block 2
1048
add w12, w12, #1 //CTR block 2
1049
1050
aese v0.16b, v19.16b
1051
aesmc v0.16b, v0.16b //AES block 0 - round 1
1052
orr x9, x11, x9, lsl #32 //CTR block 2
1053
1054
fmov v2.d[1], x9 //CTR block 2
1055
rev w9, w12 //CTR block 3
1056
1057
fmov d3, x10 //CTR block 3
1058
orr x9, x11, x9, lsl #32 //CTR block 3
1059
add w12, w12, #1 //CTR block 3
1060
1061
fmov v3.d[1], x9 //CTR block 3
1062
add x4, x0, x1, lsr #3 //end_input_ptr
1063
1064
aese v1.16b, v18.16b
1065
aesmc v1.16b, v1.16b //AES block 1 - round 0
1066
ld1 {v21.4s}, [x8], #16 //load rk3
1067
1068
aese v0.16b, v20.16b
1069
aesmc v0.16b, v0.16b //AES block 0 - round 2
1070
ld1 {v22.4s}, [x8], #16 //load rk4
1071
1072
aese v2.16b, v18.16b
1073
aesmc v2.16b, v2.16b //AES block 2 - round 0
1074
ld1 {v23.4s}, [x8], #16 //load rk5
1075
1076
aese v1.16b, v19.16b
1077
aesmc v1.16b, v1.16b //AES block 1 - round 1
1078
ld1 {v24.4s}, [x8], #16 //load rk6
1079
1080
aese v3.16b, v18.16b
1081
aesmc v3.16b, v3.16b //AES block 3 - round 0
1082
1083
aese v2.16b, v19.16b
1084
aesmc v2.16b, v2.16b //AES block 2 - round 1
1085
1086
aese v1.16b, v20.16b
1087
aesmc v1.16b, v1.16b //AES block 1 - round 2
1088
1089
aese v3.16b, v19.16b
1090
aesmc v3.16b, v3.16b //AES block 3 - round 1
1091
// Load the running GHASH value into v11 (halves swapped, bytes reversed
// within each 64-bit half).
ld1 { v11.16b}, [x3]
1092
ext v11.16b, v11.16b, v11.16b, #8
1093
rev64 v11.16b, v11.16b
1094
1095
aese v0.16b, v21.16b
1096
aesmc v0.16b, v0.16b //AES block 0 - round 3
1097
ld1 {v25.4s}, [x8], #16 //load rk7
1098
1099
aese v1.16b, v21.16b
1100
aesmc v1.16b, v1.16b //AES block 1 - round 3
1101
1102
aese v3.16b, v20.16b
1103
aesmc v3.16b, v3.16b //AES block 3 - round 2
1104
1105
aese v2.16b, v20.16b
1106
aesmc v2.16b, v2.16b //AES block 2 - round 2
1107
ld1 {v26.4s}, [x8], #16 //load rk8
1108
1109
aese v1.16b, v22.16b
1110
aesmc v1.16b, v1.16b //AES block 1 - round 4
1111
1112
aese v3.16b, v21.16b
1113
aesmc v3.16b, v3.16b //AES block 3 - round 3
1114
1115
aese v2.16b, v21.16b
1116
aesmc v2.16b, v2.16b //AES block 2 - round 3
1117
ldr q14, [x3, #80] //load h3l | h3h
1118
#ifndef __AARCH64EB__
1119
ext v14.16b, v14.16b, v14.16b, #8
1120
#endif
1121
aese v0.16b, v22.16b
1122
aesmc v0.16b, v0.16b //AES block 0 - round 4
1123
ld1 {v27.4s}, [x8], #16 //load rk9
1124
1125
aese v1.16b, v23.16b
1126
aesmc v1.16b, v1.16b //AES block 1 - round 5
1127
1128
aese v2.16b, v22.16b
1129
aesmc v2.16b, v2.16b //AES block 2 - round 4
1130
1131
aese v3.16b, v22.16b
1132
aesmc v3.16b, v3.16b //AES block 3 - round 4
1133
1134
aese v0.16b, v23.16b
1135
aesmc v0.16b, v0.16b //AES block 0 - round 5
1136
1137
aese v2.16b, v23.16b
1138
aesmc v2.16b, v2.16b //AES block 2 - round 5
1139
ldr q12, [x3, #32] //load h1l | h1h
1140
#ifndef __AARCH64EB__
1141
ext v12.16b, v12.16b, v12.16b, #8
1142
#endif
1143
aese v3.16b, v23.16b
1144
aesmc v3.16b, v3.16b //AES block 3 - round 5
1145
1146
aese v0.16b, v24.16b
1147
aesmc v0.16b, v0.16b //AES block 0 - round 6
1148
1149
aese v1.16b, v24.16b
1150
aesmc v1.16b, v1.16b //AES block 1 - round 6
1151
1152
aese v3.16b, v24.16b
1153
aesmc v3.16b, v3.16b //AES block 3 - round 6
1154
1155
aese v2.16b, v24.16b
1156
aesmc v2.16b, v2.16b //AES block 2 - round 6
1157
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
1158
1159
ldr q15, [x3, #112] //load h4l | h4h
1160
#ifndef __AARCH64EB__
1161
ext v15.16b, v15.16b, v15.16b, #8
1162
#endif
1163
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
1164
// x5 becomes the input address at which the main loop stops.
add x5, x5, x0
1165
1166
aese v1.16b, v25.16b
1167
aesmc v1.16b, v1.16b //AES block 1 - round 7
1168
1169
aese v2.16b, v25.16b
1170
aesmc v2.16b, v2.16b //AES block 2 - round 7
1171
1172
aese v0.16b, v25.16b
1173
aesmc v0.16b, v0.16b //AES block 0 - round 7
1174
// v16 and v17 hold (low half XOR high half) of the hash keys; they are the
// multiplicands for the "mid" products of the GHASH multiplications.
eor v16.16b, v16.16b, v8.16b //h2k | h1k
1175
1176
aese v3.16b, v25.16b
1177
aesmc v3.16b, v3.16b //AES block 3 - round 7
1178
1179
aese v1.16b, v26.16b
1180
aesmc v1.16b, v1.16b //AES block 1 - round 8
1181
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
1182
1183
aese v2.16b, v26.16b
1184
aesmc v2.16b, v2.16b //AES block 2 - round 8
1185
1186
aese v3.16b, v26.16b
1187
aesmc v3.16b, v3.16b //AES block 3 - round 8
1188
1189
aese v0.16b, v26.16b
1190
aesmc v0.16b, v0.16b //AES block 0 - round 8
1191
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
1192
1193
aese v2.16b, v27.16b //AES block 2 - round 9
1194
1195
aese v3.16b, v27.16b //AES block 3 - round 9
1196
1197
aese v0.16b, v27.16b //AES block 0 - round 9
1198
cmp x0, x5 //check if we have <= 4 blocks
1199
1200
aese v1.16b, v27.16b //AES block 1 - round 9
1201
eor v17.16b, v17.16b, v9.16b //h4k | h3k
1202
b.ge .L128_dec_tail //handle tail
1203
1204
ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext
1205
1206
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
1207
ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext
1208
1209
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
1210
rev64 v4.16b, v4.16b //GHASH block 0
1211
rev w9, w12 //CTR block 4
1212
1213
orr x9, x11, x9, lsl #32 //CTR block 4
1214
add w12, w12, #1 //CTR block 4
1215
ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext
1216
1217
rev64 v5.16b, v5.16b //GHASH block 1
1218
mov x19, v1.d[0] //AES block 1 - mov low
1219
1220
mov x20, v1.d[1] //AES block 1 - mov high
1221
1222
mov x6, v0.d[0] //AES block 0 - mov low
1223
// The flags from this compare are consumed by the b.ge at the end of this
// block; no instruction in between writes the condition flags.
cmp x0, x5 //check if we have <= 8 blocks
1224
1225
mov x7, v0.d[1] //AES block 0 - mov high
1226
1227
fmov d0, x10 //CTR block 4
1228
1229
fmov v0.d[1], x9 //CTR block 4
1230
rev w9, w12 //CTR block 5
1231
eor x19, x19, x13 //AES block 1 - round 10 low
1232
#ifdef __AARCH64EB__
1233
rev x19, x19
1234
#endif
1235
fmov d1, x10 //CTR block 5
1236
add w12, w12, #1 //CTR block 5
1237
orr x9, x11, x9, lsl #32 //CTR block 5
1238
1239
fmov v1.d[1], x9 //CTR block 5
1240
rev w9, w12 //CTR block 6
1241
add w12, w12, #1 //CTR block 6
1242
1243
orr x9, x11, x9, lsl #32 //CTR block 6
1244
1245
eor x20, x20, x14 //AES block 1 - round 10 high
1246
#ifdef __AARCH64EB__
1247
rev x20, x20
1248
#endif
1249
eor x6, x6, x13 //AES block 0 - round 10 low
1250
#ifdef __AARCH64EB__
1251
rev x6, x6
1252
#endif
1253
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
1254
1255
eor x7, x7, x14 //AES block 0 - round 10 high
1256
#ifdef __AARCH64EB__
1257
rev x7, x7
1258
#endif
1259
stp x6, x7, [x2], #16 //AES block 0 - store result
1260
1261
stp x19, x20, [x2], #16 //AES block 1 - store result
1262
b.ge .L128_dec_prepretail //do prepretail
1264
// Main loop, four blocks per pass. Each pass:
//   - finishes and stores output blocks 4k+2 and 4k+3 (round-10 XOR done in
//     x21-x24 with the last round key in x13:x14),
//   - folds the four ciphertext blocks held in v4-v7 into the GHASH
//     accumulators (v9 high, v10 mid, v11 low) and reduces them into v11,
//   - runs AES rounds 0-9 on the next four counter blocks in v0-v3,
//   - loads the next four ciphertext blocks and stores output blocks
//     4k+4 and 4k+5.
// The loop repeats while the input pointer x0 is below x5.
.L128_dec_main_loop: //main loop start
1265
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
1266
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
1267
mov x21, v2.d[0] //AES block 4k+2 - mov low
1268
1269
pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
1270
mov x22, v2.d[1] //AES block 4k+2 - mov high
1271
1272
aese v1.16b, v18.16b
1273
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
1274
fmov d2, x10 //CTR block 4k+6
1275
1276
rev64 v6.16b, v6.16b //GHASH block 4k+2
1277
fmov v2.d[1], x9 //CTR block 4k+6
1278
rev w9, w12 //CTR block 4k+7
1279
1280
mov x23, v3.d[0] //AES block 4k+3 - mov low
1281
eor v4.16b, v4.16b, v11.16b //PRE 1
1282
mov d30, v5.d[1] //GHASH block 4k+1 - mid
1283
1284
aese v1.16b, v19.16b
1285
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
1286
rev64 v7.16b, v7.16b //GHASH block 4k+3
1287
1288
pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
1289
mov x24, v3.d[1] //AES block 4k+3 - mov high
1290
orr x9, x11, x9, lsl #32 //CTR block 4k+7
1291
1292
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
1293
fmov d3, x10 //CTR block 4k+7
1294
eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
1295
1296
aese v1.16b, v20.16b
1297
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
1298
fmov v3.d[1], x9 //CTR block 4k+7
1299
1300
aese v2.16b, v18.16b
1301
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
1302
mov d10, v17.d[1] //GHASH block 4k - mid
1303
1304
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
1305
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
1306
1307
pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
1308
1309
aese v1.16b, v21.16b
1310
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
1311
mov d8, v4.d[1] //GHASH block 4k - mid
1312
1313
aese v3.16b, v18.16b
1314
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
1315
eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
1316
1317
aese v0.16b, v18.16b
1318
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
1319
1320
pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
1321
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
1322
1323
aese v3.16b, v19.16b
1324
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
1325
eor x23, x23, x13 //AES block 4k+3 - round 10 low
1326
#ifdef __AARCH64EB__
1327
rev x23, x23
1328
#endif
1329
pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
1330
eor x22, x22, x14 //AES block 4k+2 - round 10 high
1331
#ifdef __AARCH64EB__
1332
rev x22, x22
1333
#endif
1334
mov d31, v6.d[1] //GHASH block 4k+2 - mid
1335
1336
aese v0.16b, v19.16b
1337
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
1338
eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
1339
1340
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
1341
1342
aese v3.16b, v20.16b
1343
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
1344
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
1345
1346
aese v0.16b, v20.16b
1347
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
1348
1349
aese v1.16b, v22.16b
1350
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
1351
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
1352
1353
pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
1354
1355
aese v0.16b, v21.16b
1356
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
1357
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
1358
1359
pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
1360
1361
aese v2.16b, v19.16b
1362
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
1363
mov d30, v7.d[1] //GHASH block 4k+3 - mid
1364
1365
aese v0.16b, v22.16b
1366
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
1367
eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
1368
1369
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
1370
eor x24, x24, x14 //AES block 4k+3 - round 10 high
1371
#ifdef __AARCH64EB__
1372
rev x24, x24
1373
#endif
1374
aese v2.16b, v20.16b
1375
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
1376
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
1377
1378
aese v1.16b, v23.16b
1379
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
1380
eor x21, x21, x13 //AES block 4k+2 - round 10 low
1381
#ifdef __AARCH64EB__
1382
rev x21, x21
1383
#endif
1384
aese v0.16b, v23.16b
1385
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
1386
// v8 is rebuilt here as the reduction constant: 0xc2 in every byte, then
// shifted left by 56 further down so d8 = 0xc200000000000000.
movi v8.8b, #0xc2
1387
1388
aese v2.16b, v21.16b
1389
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
1390
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
1391
1392
aese v1.16b, v24.16b
1393
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
1394
1395
aese v0.16b, v24.16b
1396
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
1397
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
1398
1399
aese v2.16b, v22.16b
1400
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
1401
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
1402
1403
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
1404
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
1405
// From here v4-v7 are refilled with the next four ciphertext blocks; the
// old contents have already been consumed by the GHASH products above.
ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext
1406
1407
aese v1.16b, v25.16b
1408
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
1409
add w12, w12, #1 //CTR block 4k+7
1410
1411
aese v0.16b, v25.16b
1412
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
1413
shl d8, d8, #56 //mod_constant
1414
1415
aese v2.16b, v23.16b
1416
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
1417
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
1418
1419
aese v1.16b, v26.16b
1420
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
1421
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
1422
1423
aese v0.16b, v26.16b
1424
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
1425
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
1426
1427
aese v3.16b, v21.16b
1428
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
1429
rev w9, w12 //CTR block 4k+8
1430
1431
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
1432
ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
1433
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
1434
1435
aese v0.16b, v27.16b //AES block 4k+4 - round 9
1436
orr x9, x11, x9, lsl #32 //CTR block 4k+8
1437
1438
aese v3.16b, v22.16b
1439
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
1440
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
1441
1442
aese v1.16b, v27.16b //AES block 4k+5 - round 9
1443
1444
aese v2.16b, v24.16b
1445
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
1446
eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
1447
1448
aese v3.16b, v23.16b
1449
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
1450
ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
1451
1452
add w12, w12, #1 //CTR block 4k+8
1453
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
1454
eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
1455
1456
aese v2.16b, v25.16b
1457
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
1458
ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
1459
1460
aese v3.16b, v24.16b
1461
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
1462
1463
rev64 v5.16b, v5.16b //GHASH block 4k+5
1464
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
1465
mov x7, v0.d[1] //AES block 4k+4 - mov high
1466
1467
aese v2.16b, v26.16b
1468
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
1469
mov x6, v0.d[0] //AES block 4k+4 - mov low
1470
1471
aese v3.16b, v25.16b
1472
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
1473
fmov d0, x10 //CTR block 4k+8
1474
1475
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
1476
fmov v0.d[1], x9 //CTR block 4k+8
1477
rev w9, w12 //CTR block 4k+9
1478
1479
aese v2.16b, v27.16b //AES block 4k+6 - round 9
1480
orr x9, x11, x9, lsl #32 //CTR block 4k+9
1481
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
1482
1483
aese v3.16b, v26.16b
1484
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
1485
eor x7, x7, x14 //AES block 4k+4 - round 10 high
1486
#ifdef __AARCH64EB__
1487
rev x7, x7
1488
#endif
1489
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
1490
mov x20, v1.d[1] //AES block 4k+5 - mov high
1491
eor x6, x6, x13 //AES block 4k+4 - round 10 low
1492
#ifdef __AARCH64EB__
1493
rev x6, x6
1494
#endif
1495
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
1496
mov x19, v1.d[0] //AES block 4k+5 - mov low
1497
add w12, w12, #1 //CTR block 4k+9
1498
1499
aese v3.16b, v27.16b //AES block 4k+7 - round 9
1500
fmov d1, x10 //CTR block 4k+9
1501
// The flags set here are consumed by the b.lt at the bottom of the loop;
// none of the instructions in between write the condition flags.
cmp x0, x5 //.LOOP CONTROL
1502
1503
rev64 v4.16b, v4.16b //GHASH block 4k+4
1504
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
1505
fmov v1.d[1], x9 //CTR block 4k+9
1506
1507
rev w9, w12 //CTR block 4k+10
1508
add w12, w12, #1 //CTR block 4k+10
1509
1510
eor x20, x20, x14 //AES block 4k+5 - round 10 high
1511
#ifdef __AARCH64EB__
1512
rev x20, x20
1513
#endif
1514
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
1515
1516
eor x19, x19, x13 //AES block 4k+5 - round 10 low
1517
#ifdef __AARCH64EB__
1518
rev x19, x19
1519
#endif
1520
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
1521
1522
orr x9, x11, x9, lsl #32 //CTR block 4k+10
1523
b.lt .L128_dec_main_loop
1524
1525
// Prepretail: drains the state left by the setup code or the main loop
// without loading any further input. It
//   - finishes and stores output blocks 4k+2 and 4k+3,
//   - folds the four pending ciphertext blocks in v4-v7 into GHASH and
//     reduces the result into v11,
//   - runs AES rounds 0-9 on the counter blocks in v0-v3 for use by the tail.
// Execution then falls through into .L128_dec_tail.
.L128_dec_prepretail: //PREPRETAIL
1526
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
1527
mov x21, v2.d[0] //AES block 4k+2 - mov low
1528
mov d30, v5.d[1] //GHASH block 4k+1 - mid
1529
1530
aese v0.16b, v18.16b
1531
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
1532
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
1533
1534
aese v1.16b, v18.16b
1535
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
1536
mov x22, v2.d[1] //AES block 4k+2 - mov high
1537
1538
eor v4.16b, v4.16b, v11.16b //PRE 1
1539
fmov d2, x10 //CTR block 4k+6
1540
rev64 v6.16b, v6.16b //GHASH block 4k+2
1541
1542
aese v0.16b, v19.16b
1543
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
1544
fmov v2.d[1], x9 //CTR block 4k+6
1545
1546
rev w9, w12 //CTR block 4k+7
1547
mov x23, v3.d[0] //AES block 4k+3 - mov low
1548
eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
1549
1550
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
1551
mov d10, v17.d[1] //GHASH block 4k - mid
1552
mov x24, v3.d[1] //AES block 4k+3 - mov high
1553
1554
aese v1.16b, v19.16b
1555
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
1556
mov d31, v6.d[1] //GHASH block 4k+2 - mid
1557
1558
aese v0.16b, v20.16b
1559
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
1560
orr x9, x11, x9, lsl #32 //CTR block 4k+7
1561
1562
pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
1563
mov d8, v4.d[1] //GHASH block 4k - mid
1564
fmov d3, x10 //CTR block 4k+7
1565
1566
aese v2.16b, v18.16b
1567
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
1568
fmov v3.d[1], x9 //CTR block 4k+7
1569
1570
pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
1571
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
1572
1573
rev64 v7.16b, v7.16b //GHASH block 4k+3
1574
1575
aese v2.16b, v19.16b
1576
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
1577
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
1578
1579
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
1580
1581
aese v3.16b, v18.16b
1582
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
1583
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
1584
1585
pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
1586
1587
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
1588
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
1589
1590
pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
1591
1592
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
1593
eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
1594
1595
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
1596
1597
pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
1598
1599
pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
1600
mov d30, v7.d[1] //GHASH block 4k+3 - mid
1601
1602
aese v1.16b, v20.16b
1603
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
1604
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
1605
1606
pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
1607
1608
eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
1609
// v8 is free again after the XOR above and is rebuilt as the reduction
// constant (0xc2 per byte, shifted left by 56 further down).
movi v8.8b, #0xc2
1610
1611
aese v3.16b, v19.16b
1612
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
1613
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
1614
1615
eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
1616
1617
aese v2.16b, v20.16b
1618
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
1619
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
1620
1621
aese v3.16b, v20.16b
1622
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
1623
eor x23, x23, x13 //AES block 4k+3 - round 10 low
1624
#ifdef __AARCH64EB__
1625
rev x23, x23
1626
#endif
1627
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
1628
eor x21, x21, x13 //AES block 4k+2 - round 10 low
1629
#ifdef __AARCH64EB__
1630
rev x21, x21
1631
#endif
1632
eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
1633
1634
aese v2.16b, v21.16b
1635
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
1636
1637
aese v1.16b, v21.16b
1638
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
1639
shl d8, d8, #56 //mod_constant
1640
1641
aese v0.16b, v21.16b
1642
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
1643
1644
aese v2.16b, v22.16b
1645
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
1646
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
1647
1648
aese v1.16b, v22.16b
1649
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
1650
1651
aese v3.16b, v21.16b
1652
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
1653
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
1654
1655
aese v2.16b, v23.16b
1656
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
1657
1658
aese v1.16b, v23.16b
1659
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
1660
1661
aese v3.16b, v22.16b
1662
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
1663
1664
aese v0.16b, v22.16b
1665
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
1666
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
1667
1668
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
1669
1670
aese v1.16b, v24.16b
1671
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
1672
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
1673
1674
aese v3.16b, v23.16b
1675
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
1676
1677
aese v0.16b, v23.16b
1678
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
1679
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
1680
1681
aese v1.16b, v25.16b
1682
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
1683
1684
aese v2.16b, v24.16b
1685
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
1686
1687
aese v0.16b, v24.16b
1688
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
1689
1690
aese v1.16b, v26.16b
1691
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
1692
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
1693
1694
aese v3.16b, v24.16b
1695
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
1696
1697
aese v0.16b, v25.16b
1698
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
1699
1700
aese v1.16b, v27.16b //AES block 4k+5 - round 9
1701
1702
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
1703
eor x24, x24, x14 //AES block 4k+3 - round 10 high
1704
#ifdef __AARCH64EB__
1705
rev x24, x24
1706
#endif
1707
aese v2.16b, v25.16b
1708
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
1709
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
1710
1711
aese v3.16b, v25.16b
1712
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
1713
1714
aese v0.16b, v26.16b
1715
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
1716
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
1717
1718
aese v2.16b, v26.16b
1719
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
1720
1721
aese v3.16b, v26.16b
1722
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
1723
eor x22, x22, x14 //AES block 4k+2 - round 10 high
1724
#ifdef __AARCH64EB__
1725
rev x22, x22
1726
#endif
1727
aese v0.16b, v27.16b //AES block 4k+4 - round 9
1728
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
1729
1730
aese v2.16b, v27.16b //AES block 4k+6 - round 9
1731
add w12, w12, #1 //CTR block 4k+7
1732
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
1733
1734
aese v3.16b, v27.16b //AES block 4k+7 - round 9
1735
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
1736
// Tail entry. x5 is recomputed as the number of input bytes still to be
// processed (x4 - x0). The first remaining block is decrypted using v0, then
// control is dispatched on x5:
//   x5 > 48 -> .L128_dec_blocks_more_than_3
//   x5 > 32 -> .L128_dec_blocks_more_than_2
//   x5 > 16 -> .L128_dec_blocks_more_than_1
//   else    -> .L128_dec_blocks_less_than_1
// On the three shorter paths the GHASH accumulators v9/v10/v11 are cleared
// (a rotated copy of v11 is taken in v8 first), the counter blocks
// in v1/v2 are moved up so the later sections find them in v2/v3, and w12 is
// decremented once for every counter block that will not be used.
.L128_dec_tail: //TAIL
1737
1738
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
1739
ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
1740
1741
eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result
1742
1743
mov x7, v0.d[1] //AES block 4k+4 - mov high
1744
1745
mov x6, v0.d[0] //AES block 4k+4 - mov low
1746
1747
// Flags from this compare are used by the b.gt below; the eor/rev/ext
// instructions in between do not write the condition flags.
cmp x5, #48
1748
1749
eor x7, x7, x14 //AES block 4k+4 - round 10 high
1750
#ifdef __AARCH64EB__
1751
rev x7, x7
1752
#endif
1753
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
1754
eor x6, x6, x13 //AES block 4k+4 - round 10 low
1755
#ifdef __AARCH64EB__
1756
rev x6, x6
1757
#endif
1758
b.gt .L128_dec_blocks_more_than_3
1759
1760
mov v3.16b, v2.16b
1761
sub w12, w12, #1
1762
movi v11.8b, #0
1763
1764
movi v9.8b, #0
1765
mov v2.16b, v1.16b
1766
1767
movi v10.8b, #0
1768
cmp x5, #32
1769
b.gt .L128_dec_blocks_more_than_2
1770
1771
cmp x5, #16
1772
1773
mov v3.16b, v1.16b
1774
sub w12, w12, #1
1775
b.gt .L128_dec_blocks_more_than_1
1776
1777
sub w12, w12, #1
1778
b .L128_dec_blocks_less_than_1
1779
// More than three blocks remain: GHASH the ciphertext block in v5 with the
// key in v15 (starting the v9/v10/v11 accumulators), store the plaintext
// already held in x6:x7, then load the next ciphertext block and decrypt it
// with the counter block in v1. v22 is used as scratch for the mid term.
// Falls through into .L128_dec_blocks_more_than_2.
.L128_dec_blocks_more_than_3: //blocks left > 3
1780
rev64 v4.16b, v5.16b //GHASH final-3 block
1781
ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext
1782
1783
eor v4.16b, v4.16b, v8.16b //feed in partial tag
1784
1785
mov d10, v17.d[1] //GHASH final-3 block - mid
1786
stp x6, x7, [x2], #16 //AES final-3 block - store result
1787
eor v0.16b, v5.16b, v1.16b //AES final-2 block - result
1788
1789
mov d22, v4.d[1] //GHASH final-3 block - mid
1790
mov x7, v0.d[1] //AES final-2 block - mov high
1791
1792
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
1793
mov x6, v0.d[0] //AES final-2 block - mov low
1794
1795
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
1796
1797
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
1798
1799
movi v8.8b, #0 //suppress further partial tag feed in
1800
eor x7, x7, x14 //AES final-2 block - round 10 high
1801
#ifdef __AARCH64EB__
1802
rev x7, x7
1803
#endif
1804
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
1805
eor x6, x6, x13 //AES final-2 block - round 10 low
1806
#ifdef __AARCH64EB__
1807
rev x6, x6
1808
#endif
1809
// More than two blocks remain: GHASH the ciphertext block in v5 with the
// key in v14 (mid term uses v17), accumulate into v9/v10/v11, store the
// plaintext held in x6:x7, then load the next ciphertext block and decrypt it
// with the counter block in v2. v20-v22 are used as scratch.
// Falls through into .L128_dec_blocks_more_than_1.
.L128_dec_blocks_more_than_2: //blocks left > 2
1810
1811
rev64 v4.16b, v5.16b //GHASH final-2 block
1812
ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext
1813
1814
eor v4.16b, v4.16b, v8.16b //feed in partial tag
1815
1816
eor v0.16b, v5.16b, v2.16b //AES final-1 block - result
1817
stp x6, x7, [x2], #16 //AES final-2 block - store result
1818
1819
mov d22, v4.d[1] //GHASH final-2 block - mid
1820
1821
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
1822
1823
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
1824
mov x6, v0.d[0] //AES final-1 block - mov low
1825
1826
mov x7, v0.d[1] //AES final-1 block - mov high
1827
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
1828
1829
movi v8.8b, #0 //suppress further partial tag feed in
1830
1831
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
1832
1833
eor x6, x6, x13 //AES final-1 block - round 10 low
1834
#ifdef __AARCH64EB__
1835
rev x6, x6
1836
#endif
1837
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
1838
1839
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
1840
1841
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
1842
eor x7, x7, x14 //AES final-1 block - round 10 high
1843
#ifdef __AARCH64EB__
1844
rev x7, x7
1845
#endif
1846
// More than one block remains: GHASH the ciphertext block in v5 with the
// key in v13 (mid term uses the upper half of v16), accumulate into
// v9/v10/v11, store the plaintext held in x6:x7, then load the last
// ciphertext block and decrypt it with the counter block in v3.
// Falls through into .L128_dec_blocks_less_than_1.
.L128_dec_blocks_more_than_1: //blocks left > 1
1847
1848
rev64 v4.16b, v5.16b //GHASH final-1 block
1849
1850
ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext
1851
eor v4.16b, v4.16b, v8.16b //feed in partial tag
1852
1853
mov d22, v4.d[1] //GHASH final-1 block - mid
1854
1855
eor v0.16b, v5.16b, v3.16b //AES final block - result
1856
1857
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
1858
1859
stp x6, x7, [x2], #16 //AES final-1 block - store result
1860
mov x6, v0.d[0] //AES final block - mov low
1861
1862
mov x7, v0.d[1] //AES final block - mov high
1863
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
1864
1865
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
1866
1867
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
1868
1869
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
1870
movi v8.8b, #0 //suppress further partial tag feed in
1871
1872
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
1873
1874
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
1875
eor x7, x7, x14 //AES final block - round 10 high
1876
#ifdef __AARCH64EB__
1877
rev x7, x7
1878
#endif
1879
eor x6, x6, x13 //AES final block - round 10 low
1880
#ifdef __AARCH64EB__
1881
rev x6, x6
1882
#endif
1883
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
1884
// Final block, which may be partial. This section:
//   - builds a 128-bit mask (low half in x9, high half in x10) from the bit
//     length in x1; x13/x14 are reused as all-ones scratch because the last
//     round key they held is no longer needed,
//   - masks the last ciphertext block before folding it into GHASH with the
//     key in v12 (mid term uses the lower half of v16),
//   - merges the masked plaintext in x6:x7 with the 16 bytes already present
//     at [x2] so bytes outside the mask are written back unchanged,
//   - writes the updated 32-bit counter to [x16,#12],
//   - reduces the GHASH accumulators and stores the result to [x3],
//   - restores the saved registers and returns byte_len (x15) in x0.
.L128_dec_blocks_less_than_1: //blocks left <= 1
1885
1886
mvn x14, xzr //rk10_h = 0xffffffffffffffff
1887
and x1, x1, #127 //bit_length %= 128
1888
1889
mvn x13, xzr //rk10_l = 0xffffffffffffffff
1890
sub x1, x1, #128 //bit_length -= 128
1891
1892
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
1893
1894
and x1, x1, #127 //bit_length %= 128
1895
1896
// x1 is now the number of bits to drop from the block (0..127). The shift
// below uses only x1 modulo 64, so x14 is the mask for whichever 64-bit half
// is partially filled; the csel pair then places it in the right half.
lsr x14, x14, x1 //rk10_h is mask for top 64b of last block
1897
cmp x1, #64
1898
1899
csel x10, x14, xzr, lt
1900
csel x9, x13, x14, lt
1901
1902
fmov d0, x9 //ctr0b is mask for last block
1903
1904
mov v0.d[1], x10
1905
1906
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
1907
1908
rev64 v4.16b, v5.16b //GHASH final block
1909
1910
eor v4.16b, v4.16b, v8.16b //feed in partial tag
1911
1912
ldp x4, x5, [x2] //load existing bytes we need to not overwrite
1913
1914
and x7, x7, x10
1915
1916
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
1917
mov d8, v4.d[1] //GHASH final block - mid
1918
1919
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
1920
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
1921
1922
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
1923
1924
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
1925
bic x4, x4, x9 //mask out low existing bytes
1926
and x6, x6, x9
1927
1928
// x9 is no longer needed as a mask from here; it is reused to hold the
// counter word that is written back to [x16,#12] below.
#ifndef __AARCH64EB__
1929
rev w9, w12
1930
#else
1931
mov w9, w12
1932
#endif
1933
1934
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
1935
movi v8.8b, #0xc2
1936
1937
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
1938
1939
bic x5, x5, x10 //mask out high existing bytes
1940
shl d8, d8, #56 //mod_constant
1941
1942
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
1943
1944
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
1945
1946
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
1947
1948
orr x6, x6, x4
1949
str w9, [x16, #12] //store the updated counter
1950
1951
orr x7, x7, x5
1952
stp x6, x7, [x2]
1953
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
1954
1955
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
1956
1957
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
1958
1959
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
1960
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
1961
1962
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
1963
1964
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
1965
// Undo the half swap and byte reversal applied when the hash was loaded,
// then write it back to [x3].
ext v11.16b, v11.16b, v11.16b, #8
1966
rev64 v11.16b, v11.16b
1967
mov x0, x15
1968
st1 { v11.16b }, [x3]
1969
1970
// Epilogue: restore registers in the same slots the prologue saved them to
// and release the 112-byte frame.
ldp x21, x22, [sp, #16]
1971
ldp x23, x24, [sp, #32]
1972
ldp d8, d9, [sp, #48]
1973
ldp d10, d11, [sp, #64]
1974
ldp d12, d13, [sp, #80]
1975
ldp d14, d15, [sp, #96]
1976
ldp x19, x20, [sp], #112
1977
ret
1978
1979
// Early exit taken by the cbz at function entry when x1 is zero. It is
// reached before the stack frame is created, so nothing is restored here;
// the function returns 0.
.L128_dec_ret:
1980
mov w0, #0x0
1981
ret
1982
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1983
// aes_gcm_enc_192_kernel - AES-192 counter-mode encryption fused with GHASH.
//
// Registers on entry, as used by the code below:
//   x0  input pointer (plaintext is loaded from here)
//   x1  length in bits (shifted right by 3 to obtain byte_len)
//   x2  output pointer (results are stored here)
//   x3  pointer to the GHASH state at [x3]; hash key values are loaded from
//       offsets #32 (h1), #64 (h2), #80 (h3) and #112 (h4)
//   x4  pointer to the 16-byte counter block (copied to x16)
//   x5  pointer to the round keys rk0..rk12 (copied to x8)
// Result: x0 = byte_len, or 0 when x1 is zero.
// x19-x24 and d8-d15 are saved in a 112-byte stack frame and restored on exit.
//
// Register roles established here and kept for the rest of the function:
//   v18-v29  rk0..rk11          x13/x14  rk12 low/high (applied on the GPR side)
//   v12-v15  h1..h4             v16/v17  h2k|h1k and h4k|h3k (karatsuba terms)
//   v11      GHASH accumulator  v0-v3    counter blocks / keystream
//   x10/x11  constant counter words, w12 running 32-bit counter (rev_ctr32)
//   x4       end_input_ptr      x5       end of the region handled by the main loop
//   x15      byte_len (return value)
.globl aes_gcm_enc_192_kernel
.type aes_gcm_enc_192_kernel,%function
.align 4
aes_gcm_enc_192_kernel:
    AARCH64_VALID_CALL_TARGET
    // Zero-length request: leave before any stack frame is created.
    cbz x1, .L192_enc_ret
    stp x19, x20, [sp, #-112]!
    mov x16, x4
    mov x8, x5
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]

    ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
    rev x10, x10
    rev x11, x11
#endif
    ldp x13, x14, [x8, #192] //load rk12
#ifdef __AARCH64EB__
    ror x13, x13, #32
    ror x14, x14, #32
#endif
    ld1 {v18.4s}, [x8], #16 //load rk0

    ld1 {v19.4s}, [x8], #16 //load rk1

    ld1 {v20.4s}, [x8], #16 //load rk2

    // Split the second counter word: the upper 32 bits go to x12 (the counter),
    // and the 32-bit write to w11 clears the upper half of x11.
    lsr x12, x11, #32
    ld1 {v21.4s}, [x8], #16 //load rk3
    orr w11, w11, w11

    ld1 {v22.4s}, [x8], #16 //load rk4
    rev w12, w12 //rev_ctr32

    add w12, w12, #1 //increment rev_ctr32
    fmov d3, x10 //CTR block 3

    rev w9, w12 //CTR block 1
    add w12, w12, #1 //CTR block 1
    fmov d1, x10 //CTR block 1

    orr x9, x11, x9, lsl #32 //CTR block 1
    ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible

    fmov v1.d[1], x9 //CTR block 1
    rev w9, w12 //CTR block 2
    add w12, w12, #1 //CTR block 2

    fmov d2, x10 //CTR block 2
    orr x9, x11, x9, lsl #32 //CTR block 2

    fmov v2.d[1], x9 //CTR block 2
    rev w9, w12 //CTR block 3

    orr x9, x11, x9, lsl #32 //CTR block 3
    ld1 {v23.4s}, [x8], #16 //load rk5

    fmov v3.d[1], x9 //CTR block 3

    ld1 {v24.4s}, [x8], #16 //load rk6

    ld1 {v25.4s}, [x8], #16 //load rk7

    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 0
    // Load the current GHASH state and convert it to the internal layout
    // (the exit path applies the same ext/rev64 pair before storing it back).
    ld1 { v11.16b}, [x3]
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 0
    ld1 {v26.4s}, [x8], #16 //load rk8

    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 0
    ldr q15, [x3, #112] //load h4l | h4h
#ifndef __AARCH64EB__
    ext v15.16b, v15.16b, v15.16b, #8
#endif
    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 0
    ld1 {v27.4s}, [x8], #16 //load rk9

    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 1
    ld1 {v28.4s}, [x8], #16 //load rk10

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 1
    ldr q12, [x3, #32] //load h1l | h1h
#ifndef __AARCH64EB__
    ext v12.16b, v12.16b, v12.16b, #8
#endif
    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 1
    ld1 {v29.4s}, [x8], #16 //load rk11

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 1
    ldr q14, [x3, #80] //load h3l | h3h
#ifndef __AARCH64EB__
    ext v14.16b, v14.16b, v14.16b, #8
#endif
    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 2

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 2

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 2

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 3
    trn1 v9.2d, v14.2d, v15.2d //h4h | h3h

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 3

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 2
    trn2 v17.2d, v14.2d, v15.2d //h4l | h3l

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 4

    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 3

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 3

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 5

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 4

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 4

    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 6

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 4

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 5

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 5

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 5

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 6
    ldr q13, [x3, #64] //load h2l | h2h
#ifndef __AARCH64EB__
    ext v13.16b, v13.16b, v13.16b, #8
#endif
    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 6

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 6

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 7

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 7
    trn2 v16.2d, v12.2d, v13.2d //h2l | h1l

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 7

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 8

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 7
    trn1 v8.2d, v12.2d, v13.2d //h2h | h1h

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 8

    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 8

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 8

    aese v0.16b, v27.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 9

    aese v3.16b, v27.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 9

    aese v2.16b, v27.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 9

    aese v1.16b, v27.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 9

    aese v0.16b, v28.16b
    aesmc v0.16b, v0.16b //AES block 0 - round 10

    aese v2.16b, v28.16b
    aesmc v2.16b, v2.16b //AES block 2 - round 10

    aese v1.16b, v28.16b
    aesmc v1.16b, v1.16b //AES block 1 - round 10
    lsr x5, x1, #3 //byte_len
    mov x15, x5 // keep byte_len for the return value

    aese v3.16b, v28.16b
    aesmc v3.16b, v3.16b //AES block 3 - round 10
    sub x5, x5, #1 //byte_len - 1

    eor v16.16b, v16.16b, v8.16b //h2k | h1k
    and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

    eor v17.16b, v17.16b, v9.16b //h4k | h3k

    aese v2.16b, v29.16b //AES block 2 - round 11
    add x4, x0, x1, lsr #3 //end_input_ptr
    add x5, x5, x0 // x5 = input address where the main-loop region ends

    aese v1.16b, v29.16b //AES block 1 - round 11
    cmp x0, x5 //check if we have <= 4 blocks

    aese v0.16b, v29.16b //AES block 0 - round 11
    add w12, w12, #1 //CTR block 3

    aese v3.16b, v29.16b //AES block 3 - round 11
    b.ge .L192_enc_tail //handle tail

    // First group of four blocks: the final round key rk12 is XORed into the
    // plaintext in general registers (x13/x14), then the result is XORed with
    // the AES state in v0-v3 to produce the output blocks v4-v7.
    rev w9, w12 //CTR block 4
    ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    orr x9, x11, x9, lsl #32 //CTR block 4
    ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
#ifdef __AARCH64EB__
    rev x21, x21
    rev x22, x22
#endif
    ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
#ifdef __AARCH64EB__
    rev x23, x23
    rev x24, x24
#endif
    ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
#ifdef __AARCH64EB__
    rev x19, x19
    rev x20, x20
#endif
    add x0, x0, #64 //AES input_ptr update
    cmp x0, x5 //check if we have <= 8 blocks

    eor x6, x6, x13 //AES block 0 - round 12 low

    eor x7, x7, x14 //AES block 0 - round 12 high
    eor x22, x22, x14 //AES block 2 - round 12 high
    fmov d4, x6 //AES block 0 - mov low

    eor x24, x24, x14 //AES block 3 - round 12 high
    fmov v4.d[1], x7 //AES block 0 - mov high

    eor x21, x21, x13 //AES block 2 - round 12 low
    eor x19, x19, x13 //AES block 1 - round 12 low

    fmov d5, x19 //AES block 1 - mov low
    eor x20, x20, x14 //AES block 1 - round 12 high

    fmov v5.d[1], x20 //AES block 1 - mov high

    eor x23, x23, x13 //AES block 3 - round 12 low
    fmov d6, x21 //AES block 2 - mov low

    add w12, w12, #1 //CTR block 4
    eor v4.16b, v4.16b, v0.16b //AES block 0 - result
    fmov d0, x10 //CTR block 4

    fmov v0.d[1], x9 //CTR block 4
    rev w9, w12 //CTR block 5

    orr x9, x11, x9, lsl #32 //CTR block 5
    add w12, w12, #1 //CTR block 5

    fmov d7, x23 //AES block 3 - mov low
    st1 { v4.16b}, [x2], #16 //AES block 0 - store result

    fmov v6.d[1], x22 //AES block 2 - mov high

    eor v5.16b, v5.16b, v1.16b //AES block 1 - result
    fmov d1, x10 //CTR block 5
    st1 { v5.16b}, [x2], #16 //AES block 1 - store result

    fmov v7.d[1], x24 //AES block 3 - mov high

    fmov v1.d[1], x9 //CTR block 5
    rev w9, w12 //CTR block 6

    orr x9, x11, x9, lsl #32 //CTR block 6

    add w12, w12, #1 //CTR block 6
    eor v6.16b, v6.16b, v2.16b //AES block 2 - result
    fmov d2, x10 //CTR block 6

    fmov v2.d[1], x9 //CTR block 6
    rev w9, w12 //CTR block 7

    orr x9, x11, x9, lsl #32 //CTR block 7
    st1 { v6.16b}, [x2], #16 //AES block 2 - store result

    eor v7.16b, v7.16b, v3.16b //AES block 3 - result
    st1 { v7.16b}, [x2], #16 //AES block 3 - store result
    // Uses the flags of the "<= 8 blocks" cmp above; nothing in between sets flags.
    b.ge .L192_enc_prepretail //do prepretail
2311
2312
// Main loop. Each pass interleaves two independent instruction streams:
//   AES   - rounds 0..11 on the four counter blocks in v0-v3 (blocks 4k+4..4k+7),
//           with rk12 (x13/x14) XORed into the freshly loaded plaintext in
//           general registers; the XOR of the two gives the next v4-v7.
//   GHASH - the four output blocks of the previous pass (v4-v7 on entry) are
//           multiplied by h4..h1 (v15, v14, v13, v12), summed into the high,
//           mid and low parts v9/v10/v11 and reduced back into v11.
// x0 advances by 64 per pass; the loop repeats while x0 < x5.
.L192_enc_main_loop: //main loop start
    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
    rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)

    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
    ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
    rev x19, x19
    rev x20, x20
#endif
    ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
    fmov d3, x10 //CTR block 4k+3
    rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)

    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
    fmov v3.d[1], x9 //CTR block 4k+3

    pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
    rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
    rev x21, x21
    rev x22, x22
#endif
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
    ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
    rev x23, x23
    rev x24, x24
#endif
    pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
    eor v4.16b, v4.16b, v11.16b //PRE 1

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 1

    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
    rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
    eor x24, x24, x14 //AES block 4k+3 - round 12 high

    pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
    mov d8, v4.d[1] //GHASH block 4k - mid

    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 2

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
    eor x21, x21, x13 //AES block 4k+6 - round 12 low

    eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
    eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
    eor x19, x19, x13 //AES block 4k+5 - round 12 low

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
    mov d31, v6.d[1] //GHASH block 4k+2 - mid

    pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
    mov d4, v5.d[1] //GHASH block 4k+1 - mid

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 2

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 3

    mov d10, v17.d[1] //GHASH block 4k - mid
    eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
    eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid

    pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
    eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid

    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 3

    pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
    eor x20, x20, x14 //AES block 4k+5 - round 12 high
    ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
    add w12, w12, #1 //CTR block 4k+3

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
    eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high

    pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
    eor x22, x22, x14 //AES block 4k+6 - round 12 high

    pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
    eor x23, x23, x13 //AES block 4k+3 - round 12 low
    mov d30, v7.d[1] //GHASH block 4k+3 - mid

    pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
    rev w9, w12 //CTR block 4k+8

    pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
    orr x9, x11, x9, lsl #32 //CTR block 4k+8

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
    eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
    ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
    eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
    add x0, x0, #64 //AES input_ptr update

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
    // v8 is rebuilt as the reduction constant: 0xc2 in each byte, then the low
    // 64-bit lane is shifted left by 56 (see mod_constant below).
    movi v8.8b, #0xc2

    pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
    eor x7, x7, x14 //AES block 4k+4 - round 12 high
    eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
    eor x6, x6, x13 //AES block 4k+4 - round 12 low

    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
    shl d8, d8, #56 //mod_constant

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
    eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
    fmov d5, x19 //AES block 4k+5 - mov low

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
    eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
    fmov v5.d[1], x20 //AES block 4k+5 - mov high

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
    eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low

    pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
    cmp x0, x5 //.LOOP CONTROL
    fmov d4, x6 //AES block 4k+4 - mov low

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
    fmov v4.d[1], x7 //AES block 4k+4 - mov high

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
    fmov d7, x23 //AES block 4k+3 - mov low

    eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
    eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
    add w12, w12, #1 //CTR block 4k+8

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
    fmov v7.d[1], x24 //AES block 4k+3 - mov high

    pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
    ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
    fmov d6, x21 //AES block 4k+6 - mov low

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 7

    aese v0.16b, v27.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
    eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 8

    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 8

    aese v1.16b, v27.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 9

    aese v0.16b, v28.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
    eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid

    aese v3.16b, v27.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 9

    aese v2.16b, v27.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 9

    aese v0.16b, v29.16b //AES block 4k+4 - round 11

    aese v1.16b, v28.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
    eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid

    aese v2.16b, v28.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 10

    eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
    fmov d0, x10 //CTR block 4k+8

    aese v1.16b, v29.16b //AES block 4k+5 - round 11
    fmov v0.d[1], x9 //CTR block 4k+8
    rev w9, w12 //CTR block 4k+9

    pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
    fmov v6.d[1], x22 //AES block 4k+6 - mov high
    st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result

    aese v3.16b, v28.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
    orr x9, x11, x9, lsl #32 //CTR block 4k+9

    eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
    add w12, w12, #1 //CTR block 4k+9
    fmov d1, x10 //CTR block 4k+9

    aese v2.16b, v29.16b //AES block 4k+6 - round 11
    fmov v1.d[1], x9 //CTR block 4k+9
    rev w9, w12 //CTR block 4k+10

    add w12, w12, #1 //CTR block 4k+10
    ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
    orr x9, x11, x9, lsl #32 //CTR block 4k+10

    st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
    eor v11.16b, v11.16b, v9.16b //MODULO - fold into low

    aese v3.16b, v29.16b //AES block 4k+7 - round 11
    eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
    fmov d2, x10 //CTR block 4k+10

    st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
    fmov v2.d[1], x9 //CTR block 4k+10
    rev w9, w12 //CTR block 4k+11

    eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
    orr x9, x11, x9, lsl #32 //CTR block 4k+11

    eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result
    st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result
    // Uses the flags of the LOOP CONTROL cmp above; nothing in between sets flags.
    b.lt .L192_enc_main_loop
2589
2590
// Prepretail. Same two interleaved streams as the main loop, but no input is
// loaded and nothing is stored: it hashes the four output blocks still held in
// v4-v7, reduces the GHASH sum into v11, and runs AES rounds 0..11 on the
// counter blocks v0-v3 so that the tail code that follows only has to apply
// rk12 and the plaintext. Execution falls through into the tail.
.L192_enc_prepretail: //PREPRETAIL
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
    rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)

    fmov d3, x10 //CTR block 4k+3
    ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
    add w12, w12, #1 //CTR block 4k+3

    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
    rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)

    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 0

    fmov v3.d[1], x9 //CTR block 4k+3
    eor v4.16b, v4.16b, v11.16b //PRE 1
    mov d10, v17.d[1] //GHASH block 4k - mid

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
    rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)

    pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high

    pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
    mov d8, v4.d[1] //GHASH block 4k - mid

    pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
    rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)

    pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high

    eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
    mov d4, v5.d[1] //GHASH block 4k+1 - mid

    eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
    mov d31, v6.d[1] //GHASH block 4k+2 - mid

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
    eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high

    pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high

    eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
    eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 1

    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
    eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high

    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 1

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
    mov d30, v7.d[1] //GHASH block 4k+3 - mid

    pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
    ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid

    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 2

    pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
    eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 3

    pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid

    pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid

    pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
    eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high

    pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
    eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 2

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
    eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 4

    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
    eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 3

    pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
    movi v8.8b, #0xc2

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 4

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 4

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
    eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 5

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 5

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
    eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 5

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
    eor v10.16b, v10.16b, v9.16b //karatsuba tidy up

    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 6

    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
    shl d8, d8, #56 //mod_constant

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 7

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
    eor v10.16b, v10.16b, v11.16b //karatsuba tidy up (low part)

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 7

    pmull v30.1q, v9.1d, v8.1d //MODULO - top 64b align with mid

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
    ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 8

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
    eor v10.16b, v10.16b, v30.16b //MODULO - fold into mid

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 7

    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 8

    aese v0.16b, v27.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 9

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
    eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid

    aese v3.16b, v27.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 9

    aese v1.16b, v27.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 9

    aese v2.16b, v27.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 9

    pmull v30.1q, v10.1d, v8.1d //MODULO - mid 64b align with low

    ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment

    aese v3.16b, v28.16b
    aesmc v3.16b, v3.16b //AES block 4k+7 - round 10

    aese v0.16b, v28.16b
    aesmc v0.16b, v0.16b //AES block 4k+4 - round 10

    aese v2.16b, v28.16b
    aesmc v2.16b, v2.16b //AES block 4k+6 - round 10

    aese v1.16b, v28.16b
    aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
    eor v11.16b, v11.16b, v30.16b //MODULO - fold into low

    aese v0.16b, v29.16b //AES block 4k+4 - round 11

    aese v3.16b, v29.16b //AES block 4k+7 - round 11

    aese v2.16b, v29.16b //AES block 4k+6 - round 11

    aese v1.16b, v29.16b //AES block 4k+5 - round 11
    eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
2802
// Tail dispatch. x5 = end_input_ptr - input pointer = bytes still to process.
// The first remaining block is encrypted with keystream v0 into v5, then the
// code branches on how many bytes are left (> 48, > 32, > 16, otherwise one
// block). For every keystream block that will not be used, the counter w12 is
// stepped back by one and the keystream registers are shifted so that the later
// stages always find theirs in v2 (final-1 block) and v3 (final block).
// When fewer than four blocks remain, the GHASH partial sums v9/v10/v11 are
// cleared here; the four-block path assigns them with its first multiplies.
.L192_enc_tail: //TAIL

    sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
    ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor x6, x6, x13 //AES block 4k+4 - round 12 low
    eor x7, x7, x14 //AES block 4k+4 - round 12 high

    fmov d4, x6 //AES block 4k+4 - mov low

    fmov v4.d[1], x7 //AES block 4k+4 - mov high
    cmp x5, #48

    eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result

    ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
    b.gt .L192_enc_blocks_more_than_3

    // At most three blocks left: drop one keystream block.
    sub w12, w12, #1
    movi v10.8b, #0

    mov v3.16b, v2.16b
    movi v9.8b, #0
    cmp x5, #32

    mov v2.16b, v1.16b
    movi v11.8b, #0
    b.gt .L192_enc_blocks_more_than_2

    // At most two blocks left: drop another keystream block.
    sub w12, w12, #1

    mov v3.16b, v1.16b
    cmp x5, #16
    b.gt .L192_enc_blocks_more_than_1

    // Only the block already in v5 is left.
    sub w12, w12, #1
    b .L192_enc_blocks_less_than_1
2842
// More than three blocks left: store the final-3 output block (v5), hash it
// with h4 (v15, mid term from the upper half of v17), and encrypt the final-2
// block with keystream v1. This stage assigns v11/v9/v10 (low/high/mid) rather
// than accumulating into them. v22 held rk4 earlier; it is used as a GHASH
// temporary from here on. Falls through to the next stage.
.L192_enc_blocks_more_than_3: //blocks left > 3
    st1 { v5.16b}, [x2], #16 //AES final-3 block - store result

    ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    rev64 v4.16b, v5.16b //GHASH final-3 block

    eor x6, x6, x13 //AES final-2 block - round 12 low
    eor v4.16b, v4.16b, v8.16b //feed in partial tag

    eor x7, x7, x14 //AES final-2 block - round 12 high
    fmov d5, x6 //AES final-2 block - mov low

    fmov v5.d[1], x7 //AES final-2 block - mov high

    mov d22, v4.d[1] //GHASH final-3 block - mid

    pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low

    mov d10, v17.d[1] //GHASH final-3 block - mid

    eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid

    movi v8.8b, #0 //suppress further partial tag feed in

    pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high

    pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
    eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
2874
// More than two blocks left: store the final-2 output block (v5), hash it with
// h3 (v14, mid term from the lower half of v17) into v9/v10/v11, and encrypt
// the final-1 block with keystream v2. v8 carries the previous tag only into
// the first block hashed in the tail and is cleared afterwards. v20-v22 held
// rk2-rk4 earlier and serve as GHASH temporaries here. Falls through to the
// next stage.
.L192_enc_blocks_more_than_2: //blocks left > 2

    st1 { v5.16b}, [x2], #16 //AES final-2 block - store result

    rev64 v4.16b, v5.16b //GHASH final-2 block
    ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor v4.16b, v4.16b, v8.16b //feed in partial tag

    eor x7, x7, x14 //AES final-1 block - round 12 high

    pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
    mov d22, v4.d[1] //GHASH final-2 block - mid

    pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
    eor x6, x6, x13 //AES final-1 block - round 12 low

    fmov d5, x6 //AES final-1 block - mov low

    fmov v5.d[1], x7 //AES final-1 block - mov high
    eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
    eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid

    eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low

    pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid

    movi v8.8b, #0 //suppress further partial tag feed in

    eor v5.16b, v5.16b, v2.16b //AES final-1 block - result

    eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
2909
// More than one block left: store the final-1 output block (v5), hash it with
// h2 (v13, mid term from the upper half of v16 via pmull2) into v9/v10/v11,
// and encrypt the final block with keystream v3. The final block is left in
// v5 for the stage below, which masks and stores it. v20-v22 held rk2-rk4
// earlier and serve as GHASH temporaries here.
.L192_enc_blocks_more_than_1: //blocks left > 1

    st1 { v5.16b}, [x2], #16 //AES final-1 block - store result

    ldp x6, x7, [x0], #16 //AES final block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    rev64 v4.16b, v5.16b //GHASH final-1 block

    eor x6, x6, x13 //AES final block - round 12 low
    eor v4.16b, v4.16b, v8.16b //feed in partial tag
    movi v8.8b, #0 //suppress further partial tag feed in

    mov d22, v4.d[1] //GHASH final-1 block - mid

    eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
    eor x7, x7, x14 //AES final block - round 12 high
    fmov d5, x6 //AES final block - mov low

    pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
    fmov v5.d[1], x7 //AES final block - mov high

    ins v22.d[1], v22.d[0] //GHASH final-1 block - mid

    eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high

    pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low

    pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid

    eor v5.16b, v5.16b, v3.16b //AES final block - result

    eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low

    eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
2946
// Final (possibly partial) block, GHASH reduction and function exit.
//   1. Build a 128-bit mask in v0 from the bit length in x1 that keeps only
//      the valid bits of the last block. x13/x14 are no longer needed as rk12
//      and are reused as all-ones scratch for the mask.
//   2. Mask the output block v5, hash it with h1 (v12, mid term from the
//      lower half of v16) and reduce v9/v10/v11 into v11.
//   3. Merge the bytes already present at [x2] back into the masked-off part
//      of v5 (bif) and store all 16 bytes.
//   4. Write the updated counter word to [x16, #12], store the GHASH state to
//      [x3], return byte_len (x15) in x0 and restore the saved registers.
// v18, v20 and v21 held rk0, rk2 and rk3 earlier; no AES rounds remain, so
// they are reused as temporaries.
.L192_enc_blocks_less_than_1: //blocks left <= 1

    ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
#ifndef __AARCH64EB__
    rev w9, w12
#else
    mov w9, w12
#endif
    and x1, x1, #127 //bit_length %= 128

    sub x1, x1, #128 //bit_length -= 128
    mvn x14, xzr //rk12_h = 0xffffffffffffffff

    neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
    mvn x13, xzr //rk12_l = 0xffffffffffffffff

    and x1, x1, #127 //bit_length %= 128

    lsr x14, x14, x1 //rk12_h is mask for top 64b of last block
    cmp x1, #64

    // x1 < 64: low lane keeps all bits, high lane gets the shifted mask;
    // otherwise the low lane gets the shifted mask and the high lane is zero.
    csel x6, x13, x14, lt
    csel x7, x14, xzr, lt

    fmov d0, x6 //ctr0b is mask for last block

    fmov v0.d[1], x7

    and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits

    rev64 v4.16b, v5.16b //GHASH final block

    eor v4.16b, v4.16b, v8.16b //feed in partial tag

    mov d8, v4.d[1] //GHASH final block - mid

    pmull v21.1q, v4.1d, v12.1d //GHASH final block - low

    pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high

    eor v8.8b, v8.8b, v4.8b //GHASH final block - mid

    eor v11.16b, v11.16b, v21.16b //GHASH final block - low

    eor v9.16b, v9.16b, v20.16b //GHASH final block - high

    pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid

    eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
    movi v8.8b, #0xc2

    eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up

    shl d8, d8, #56 //mod_constant

    bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing

    eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up

    pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid

    ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment

    eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid

    eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid

    pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low

    ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment

    eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
    str w9, [x16, #12] //store the updated counter

    st1 { v5.16b}, [x2] //store all 16B

    eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
    // Convert the GHASH state back with the same ext/rev64 pair applied when
    // it was loaded, then store it.
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    mov x0, x15 // return value: byte_len
    st1 { v11.16b }, [x3]

    // Restore the registers saved at entry and release the 112-byte frame.
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    ldp x19, x20, [sp], #112
    ret
3036
3037
// Early exit taken by the cbz at function entry when the bit length in x1 is
// zero. No stack frame has been created at that point, so this path returns 0
// without restoring anything.
.L192_enc_ret:
    mov w0, #0x0
    ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3041
.globl aes_gcm_dec_192_kernel
3042
.type aes_gcm_dec_192_kernel,%function
3043
.align 4
3044
aes_gcm_dec_192_kernel:
3045
AARCH64_VALID_CALL_TARGET
3046
cbz x1, .L192_dec_ret
3047
stp x19, x20, [sp, #-112]!
3048
mov x16, x4
3049
mov x8, x5
3050
stp x21, x22, [sp, #16]
3051
stp x23, x24, [sp, #32]
3052
stp d8, d9, [sp, #48]
3053
stp d10, d11, [sp, #64]
3054
stp d12, d13, [sp, #80]
3055
stp d14, d15, [sp, #96]
3056
3057
add x4, x0, x1, lsr #3 //end_input_ptr
3058
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
3059
#ifdef __AARCH64EB__
3060
rev x10, x10
3061
rev x11, x11
3062
#endif
3063
ldp x13, x14, [x8, #192] //load rk12
3064
#ifdef __AARCH64EB__
3065
ror x13, x13, #32
3066
ror x14, x14, #32
3067
#endif
3068
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
3069
3070
ld1 {v18.4s}, [x8], #16 //load rk0
3071
3072
lsr x5, x1, #3 //byte_len
3073
mov x15, x5
3074
ld1 {v19.4s}, [x8], #16 //load rk1
3075
3076
lsr x12, x11, #32
3077
orr w11, w11, w11
3078
fmov d3, x10 //CTR block 3
3079
3080
rev w12, w12 //rev_ctr32
3081
fmov d1, x10 //CTR block 1
3082
3083
add w12, w12, #1 //increment rev_ctr32
3084
ld1 {v20.4s}, [x8], #16 //load rk2
3085
3086
aese v0.16b, v18.16b
3087
aesmc v0.16b, v0.16b //AES block 0 - round 0
3088
rev w9, w12 //CTR block 1
3089
3090
add w12, w12, #1 //CTR block 1
3091
orr x9, x11, x9, lsl #32 //CTR block 1
3092
ld1 {v21.4s}, [x8], #16 //load rk3
3093
3094
fmov v1.d[1], x9 //CTR block 1
3095
rev w9, w12 //CTR block 2
3096
add w12, w12, #1 //CTR block 2
3097
3098
fmov d2, x10 //CTR block 2
3099
orr x9, x11, x9, lsl #32 //CTR block 2
3100
3101
fmov v2.d[1], x9 //CTR block 2
3102
rev w9, w12 //CTR block 3
3103
3104
aese v0.16b, v19.16b
3105
aesmc v0.16b, v0.16b //AES block 0 - round 1
3106
orr x9, x11, x9, lsl #32 //CTR block 3
3107
3108
fmov v3.d[1], x9 //CTR block 3
3109
3110
ld1 {v22.4s}, [x8], #16 //load rk4
3111
3112
aese v0.16b, v20.16b
3113
aesmc v0.16b, v0.16b //AES block 0 - round 2
3114
3115
aese v2.16b, v18.16b
3116
aesmc v2.16b, v2.16b //AES block 2 - round 0
3117
ld1 {v23.4s}, [x8], #16 //load rk5
3118
3119
aese v1.16b, v18.16b
3120
aesmc v1.16b, v1.16b //AES block 1 - round 0
3121
ldr q15, [x3, #112] //load h4l | h4h
3122
#ifndef __AARCH64EB__
3123
ext v15.16b, v15.16b, v15.16b, #8
3124
#endif
3125
aese v3.16b, v18.16b
3126
aesmc v3.16b, v3.16b //AES block 3 - round 0
3127
ldr q13, [x3, #64] //load h2l | h2h
3128
#ifndef __AARCH64EB__
3129
ext v13.16b, v13.16b, v13.16b, #8
3130
#endif
3131
aese v2.16b, v19.16b
3132
aesmc v2.16b, v2.16b //AES block 2 - round 1
3133
ldr q14, [x3, #80] //load h3l | h3h
3134
#ifndef __AARCH64EB__
3135
ext v14.16b, v14.16b, v14.16b, #8
3136
#endif
3137
aese v1.16b, v19.16b
3138
aesmc v1.16b, v1.16b //AES block 1 - round 1
3139
3140
aese v3.16b, v19.16b
3141
aesmc v3.16b, v3.16b //AES block 3 - round 1
3142
ldr q12, [x3, #32] //load h1l | h1h
3143
#ifndef __AARCH64EB__
3144
ext v12.16b, v12.16b, v12.16b, #8
3145
#endif
3146
aese v2.16b, v20.16b
3147
aesmc v2.16b, v2.16b //AES block 2 - round 2
3148
ld1 {v24.4s}, [x8], #16 //load rk6
3149
3150
aese v0.16b, v21.16b
3151
aesmc v0.16b, v0.16b //AES block 0 - round 3
3152
ld1 {v25.4s}, [x8], #16 //load rk7
3153
3154
aese v1.16b, v20.16b
3155
aesmc v1.16b, v1.16b //AES block 1 - round 2
3156
ld1 {v26.4s}, [x8], #16 //load rk8
3157
3158
aese v3.16b, v20.16b
3159
aesmc v3.16b, v3.16b //AES block 3 - round 2
3160
ld1 {v27.4s}, [x8], #16 //load rk9
3161
3162
aese v2.16b, v21.16b
3163
aesmc v2.16b, v2.16b //AES block 2 - round 3
3164
ld1 { v11.16b}, [x3]
3165
ext v11.16b, v11.16b, v11.16b, #8
3166
rev64 v11.16b, v11.16b
3167
3168
aese v1.16b, v21.16b
3169
aesmc v1.16b, v1.16b //AES block 1 - round 3
3170
add w12, w12, #1 //CTR block 3
3171
3172
aese v3.16b, v21.16b
3173
aesmc v3.16b, v3.16b //AES block 3 - round 3
3174
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
3175
3176
aese v0.16b, v22.16b
3177
aesmc v0.16b, v0.16b //AES block 0 - round 4
3178
ld1 {v28.4s}, [x8], #16 //load rk10
3179
3180
aese v1.16b, v22.16b
3181
aesmc v1.16b, v1.16b //AES block 1 - round 4
3182
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
3183
3184
aese v2.16b, v22.16b
3185
aesmc v2.16b, v2.16b //AES block 2 - round 4
3186
3187
aese v3.16b, v22.16b
3188
aesmc v3.16b, v3.16b //AES block 3 - round 4
3189
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
3190
3191
aese v0.16b, v23.16b
3192
aesmc v0.16b, v0.16b //AES block 0 - round 5
3193
ld1 {v29.4s}, [x8], #16 //load rk11
3194
3195
aese v1.16b, v23.16b
3196
aesmc v1.16b, v1.16b //AES block 1 - round 5
3197
3198
aese v2.16b, v23.16b
3199
aesmc v2.16b, v2.16b //AES block 2 - round 5
3200
3201
aese v3.16b, v23.16b
3202
aesmc v3.16b, v3.16b //AES block 3 - round 5
3203
3204
aese v0.16b, v24.16b
3205
aesmc v0.16b, v0.16b //AES block 0 - round 6
3206
3207
aese v2.16b, v24.16b
3208
aesmc v2.16b, v2.16b //AES block 2 - round 6
3209
3210
aese v3.16b, v24.16b
3211
aesmc v3.16b, v3.16b //AES block 3 - round 6
3212
3213
aese v0.16b, v25.16b
3214
aesmc v0.16b, v0.16b //AES block 0 - round 7
3215
3216
aese v2.16b, v25.16b
3217
aesmc v2.16b, v2.16b //AES block 2 - round 7
3218
3219
aese v3.16b, v25.16b
3220
aesmc v3.16b, v3.16b //AES block 3 - round 7
3221
3222
aese v1.16b, v24.16b
3223
aesmc v1.16b, v1.16b //AES block 1 - round 6
3224
3225
aese v2.16b, v26.16b
3226
aesmc v2.16b, v2.16b //AES block 2 - round 8
3227
3228
aese v3.16b, v26.16b
3229
aesmc v3.16b, v3.16b //AES block 3 - round 8
3230
3231
aese v1.16b, v25.16b
3232
aesmc v1.16b, v1.16b //AES block 1 - round 7
3233
3234
aese v2.16b, v27.16b
3235
aesmc v2.16b, v2.16b //AES block 2 - round 9
3236
3237
aese v3.16b, v27.16b
3238
aesmc v3.16b, v3.16b //AES block 3 - round 9
3239
3240
aese v1.16b, v26.16b
3241
aesmc v1.16b, v1.16b //AES block 1 - round 8
3242
sub x5, x5, #1 //byte_len - 1
3243
3244
aese v0.16b, v26.16b
3245
aesmc v0.16b, v0.16b //AES block 0 - round 8
3246
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3247
3248
aese v3.16b, v28.16b
3249
aesmc v3.16b, v3.16b //AES block 3 - round 10
3250
add x5, x5, x0
3251
3252
aese v1.16b, v27.16b
3253
aesmc v1.16b, v1.16b //AES block 1 - round 9
3254
cmp x0, x5 //check if we have <= 4 blocks
3255
3256
aese v0.16b, v27.16b
3257
aesmc v0.16b, v0.16b //AES block 0 - round 9
3258
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
3259
3260
aese v3.16b, v29.16b //AES block 3 - round 11
3261
3262
aese v2.16b, v28.16b
3263
aesmc v2.16b, v2.16b //AES block 2 - round 10
3264
3265
aese v1.16b, v28.16b
3266
aesmc v1.16b, v1.16b //AES block 1 - round 10
3267
3268
aese v0.16b, v28.16b
3269
aesmc v0.16b, v0.16b //AES block 0 - round 10
3270
eor v16.16b, v16.16b, v8.16b //h2k | h1k
3271
3272
aese v2.16b, v29.16b //AES block 2 - round 11
3273
3274
aese v1.16b, v29.16b //AES block 1 - round 11
3275
eor v17.16b, v17.16b, v9.16b //h4k | h3k
3276
3277
aese v0.16b, v29.16b //AES block 0 - round 11
3278
b.ge .L192_dec_tail //handle tail
3279
3280
ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
3281
3282
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
3283
3284
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
3285
rev w9, w12 //CTR block 4
3286
ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext
3287
3288
mov x19, v1.d[0] //AES block 1 - mov low
3289
3290
mov x20, v1.d[1] //AES block 1 - mov high
3291
3292
mov x6, v0.d[0] //AES block 0 - mov low
3293
orr x9, x11, x9, lsl #32 //CTR block 4
3294
add w12, w12, #1 //CTR block 4
3295
3296
mov x7, v0.d[1] //AES block 0 - mov high
3297
rev64 v4.16b, v4.16b //GHASH block 0
3298
3299
fmov d0, x10 //CTR block 4
3300
rev64 v5.16b, v5.16b //GHASH block 1
3301
cmp x0, x5 //check if we have <= 8 blocks
3302
3303
eor x19, x19, x13 //AES block 1 - round 12 low
3304
#ifdef __AARCH64EB__
3305
rev x19, x19
3306
#endif
3307
fmov v0.d[1], x9 //CTR block 4
3308
rev w9, w12 //CTR block 5
3309
3310
orr x9, x11, x9, lsl #32 //CTR block 5
3311
fmov d1, x10 //CTR block 5
3312
eor x20, x20, x14 //AES block 1 - round 12 high
3313
#ifdef __AARCH64EB__
3314
rev x20, x20
3315
#endif
3316
add w12, w12, #1 //CTR block 5
3317
fmov v1.d[1], x9 //CTR block 5
3318
eor x6, x6, x13 //AES block 0 - round 12 low
3319
#ifdef __AARCH64EB__
3320
rev x6, x6
3321
#endif
3322
rev w9, w12 //CTR block 6
3323
eor x7, x7, x14 //AES block 0 - round 12 high
3324
#ifdef __AARCH64EB__
3325
rev x7, x7
3326
#endif
3327
stp x6, x7, [x2], #16 //AES block 0 - store result
3328
orr x9, x11, x9, lsl #32 //CTR block 6
3329
3330
stp x19, x20, [x2], #16 //AES block 1 - store result
3331
3332
add w12, w12, #1 //CTR block 6
3333
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
3334
b.ge .L192_dec_prepretail //do prepretail
3335
3336
.L192_dec_main_loop: //main loop start
3337
aese v1.16b, v18.16b
3338
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
3339
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
3340
3341
pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
3342
mov x21, v2.d[0] //AES block 4k+2 - mov low
3343
3344
mov x22, v2.d[1] //AES block 4k+2 - mov high
3345
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
3346
rev64 v7.16b, v7.16b //GHASH block 4k+3
3347
3348
aese v1.16b, v19.16b
3349
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
3350
fmov d2, x10 //CTR block 4k+6
3351
3352
aese v0.16b, v18.16b
3353
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
3354
eor v4.16b, v4.16b, v11.16b //PRE 1
3355
3356
pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
3357
fmov v2.d[1], x9 //CTR block 4k+6
3358
3359
aese v1.16b, v20.16b
3360
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
3361
mov x24, v3.d[1] //AES block 4k+3 - mov high
3362
3363
aese v0.16b, v19.16b
3364
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
3365
mov x23, v3.d[0] //AES block 4k+3 - mov low
3366
3367
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
3368
fmov d3, x10 //CTR block 4k+7
3369
mov d8, v4.d[1] //GHASH block 4k - mid
3370
3371
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
3372
mov d10, v17.d[1] //GHASH block 4k - mid
3373
rev w9, w12 //CTR block 4k+7
3374
3375
aese v2.16b, v18.16b
3376
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
3377
orr x9, x11, x9, lsl #32 //CTR block 4k+7
3378
3379
fmov v3.d[1], x9 //CTR block 4k+7
3380
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
3381
mov d4, v5.d[1] //GHASH block 4k+1 - mid
3382
3383
aese v1.16b, v21.16b
3384
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
3385
3386
aese v0.16b, v20.16b
3387
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
3388
eor x22, x22, x14 //AES block 4k+2 - round 12 high
3389
#ifdef __AARCH64EB__
3390
rev x22, x22
3391
#endif
3392
aese v2.16b, v19.16b
3393
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
3394
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
3395
3396
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
3397
3398
aese v3.16b, v18.16b
3399
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
3400
rev64 v6.16b, v6.16b //GHASH block 4k+2
3401
3402
aese v2.16b, v20.16b
3403
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
3404
3405
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
3406
eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
3407
eor x21, x21, x13 //AES block 4k+2 - round 12 low
3408
#ifdef __AARCH64EB__
3409
rev x21, x21
3410
#endif
3411
aese v1.16b, v22.16b
3412
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
3413
3414
aese v0.16b, v21.16b
3415
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
3416
3417
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
3418
mov d31, v6.d[1] //GHASH block 4k+2 - mid
3419
3420
aese v3.16b, v19.16b
3421
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
3422
eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
3423
3424
aese v0.16b, v22.16b
3425
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
3426
3427
pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
3428
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
3429
3430
pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
3431
3432
aese v0.16b, v23.16b
3433
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
3434
3435
eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
3436
mov d30, v7.d[1] //GHASH block 4k+3 - mid
3437
3438
aese v1.16b, v23.16b
3439
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
3440
3441
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
3442
3443
aese v3.16b, v20.16b
3444
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
3445
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
3446
3447
aese v1.16b, v24.16b
3448
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
3449
3450
aese v0.16b, v24.16b
3451
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
3452
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
3453
3454
aese v3.16b, v21.16b
3455
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
3456
3457
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
3458
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
3459
3460
aese v0.16b, v25.16b
3461
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
3462
3463
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
3464
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
3465
3466
aese v1.16b, v25.16b
3467
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
3468
3469
aese v0.16b, v26.16b
3470
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
3471
movi v8.8b, #0xc2
3472
3473
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
3474
3475
aese v1.16b, v26.16b
3476
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
3477
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
3478
3479
aese v2.16b, v21.16b
3480
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
3481
3482
aese v0.16b, v27.16b
3483
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
3484
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
3485
3486
aese v3.16b, v22.16b
3487
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
3488
3489
aese v2.16b, v22.16b
3490
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
3491
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
3492
3493
aese v0.16b, v28.16b
3494
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
3495
3496
aese v1.16b, v27.16b
3497
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
3498
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
3499
3500
aese v2.16b, v23.16b
3501
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
3502
3503
aese v3.16b, v23.16b
3504
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
3505
shl d8, d8, #56 //mod_constant
3506
3507
aese v1.16b, v28.16b
3508
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
3509
3510
aese v2.16b, v24.16b
3511
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
3512
ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
3513
3514
aese v3.16b, v24.16b
3515
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
3516
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
3517
3518
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
3519
ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
3520
eor x23, x23, x13 //AES block 4k+3 - round 12 low
3521
#ifdef __AARCH64EB__
3522
rev x23, x23
3523
#endif
3524
aese v2.16b, v25.16b
3525
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
3526
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
3527
3528
aese v0.16b, v29.16b //AES block 4k+4 - round 11
3529
add w12, w12, #1 //CTR block 4k+7
3530
3531
aese v3.16b, v25.16b
3532
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
3533
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
3534
3535
aese v2.16b, v26.16b
3536
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
3537
ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
3538
3539
aese v1.16b, v29.16b //AES block 4k+5 - round 11
3540
ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
3541
rev w9, w12 //CTR block 4k+8
3542
3543
aese v3.16b, v26.16b
3544
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
3545
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
3546
3547
aese v2.16b, v27.16b
3548
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
3549
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
3550
3551
cmp x0, x5 //.LOOP CONTROL
3552
3553
eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
3554
eor x24, x24, x14 //AES block 4k+3 - round 12 high
3555
#ifdef __AARCH64EB__
3556
rev x24, x24
3557
#endif
3558
eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
3559
3560
aese v2.16b, v28.16b
3561
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
3562
orr x9, x11, x9, lsl #32 //CTR block 4k+8
3563
3564
aese v3.16b, v27.16b
3565
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
3566
3567
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
3568
mov x19, v1.d[0] //AES block 4k+5 - mov low
3569
3570
mov x6, v0.d[0] //AES block 4k+4 - mov low
3571
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
3572
rev64 v5.16b, v5.16b //GHASH block 4k+5
3573
3574
aese v2.16b, v29.16b //AES block 4k+6 - round 11
3575
mov x7, v0.d[1] //AES block 4k+4 - mov high
3576
3577
aese v3.16b, v28.16b
3578
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
3579
mov x20, v1.d[1] //AES block 4k+5 - mov high
3580
3581
fmov d0, x10 //CTR block 4k+8
3582
add w12, w12, #1 //CTR block 4k+8
3583
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
3584
3585
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
3586
fmov v0.d[1], x9 //CTR block 4k+8
3587
rev w9, w12 //CTR block 4k+9
3588
3589
eor x6, x6, x13 //AES block 4k+4 - round 12 low
3590
#ifdef __AARCH64EB__
3591
rev x6, x6
3592
#endif
3593
orr x9, x11, x9, lsl #32 //CTR block 4k+9
3594
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
3595
3596
fmov d1, x10 //CTR block 4k+9
3597
add w12, w12, #1 //CTR block 4k+9
3598
eor x19, x19, x13 //AES block 4k+5 - round 12 low
3599
#ifdef __AARCH64EB__
3600
rev x19, x19
3601
#endif
3602
fmov v1.d[1], x9 //CTR block 4k+9
3603
rev w9, w12 //CTR block 4k+10
3604
eor x20, x20, x14 //AES block 4k+5 - round 12 high
3605
#ifdef __AARCH64EB__
3606
rev x20, x20
3607
#endif
3608
eor x7, x7, x14 //AES block 4k+4 - round 12 high
3609
#ifdef __AARCH64EB__
3610
rev x7, x7
3611
#endif
3612
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
3613
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
3614
3615
add w12, w12, #1 //CTR block 4k+10
3616
rev64 v4.16b, v4.16b //GHASH block 4k+4
3617
orr x9, x11, x9, lsl #32 //CTR block 4k+10
3618
3619
aese v3.16b, v29.16b //AES block 4k+7 - round 11
3620
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
3621
b.lt .L192_dec_main_loop
3622
3623
.L192_dec_prepretail: //PREPRETAIL
3624
mov x22, v2.d[1] //AES block 4k+2 - mov high
3625
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
3626
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
3627
3628
aese v1.16b, v18.16b
3629
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
3630
mov x21, v2.d[0] //AES block 4k+2 - mov low
3631
3632
aese v0.16b, v18.16b
3633
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
3634
mov d10, v17.d[1] //GHASH block 4k - mid
3635
3636
eor v4.16b, v4.16b, v11.16b //PRE 1
3637
fmov d2, x10 //CTR block 4k+6
3638
3639
aese v1.16b, v19.16b
3640
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
3641
mov x23, v3.d[0] //AES block 4k+3 - mov low
3642
3643
aese v0.16b, v19.16b
3644
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
3645
mov x24, v3.d[1] //AES block 4k+3 - mov high
3646
3647
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
3648
mov d8, v4.d[1] //GHASH block 4k - mid
3649
fmov d3, x10 //CTR block 4k+7
3650
3651
aese v1.16b, v20.16b
3652
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
3653
rev64 v6.16b, v6.16b //GHASH block 4k+2
3654
3655
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
3656
fmov v2.d[1], x9 //CTR block 4k+6
3657
rev w9, w12 //CTR block 4k+7
3658
3659
orr x9, x11, x9, lsl #32 //CTR block 4k+7
3660
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
3661
mov d4, v5.d[1] //GHASH block 4k+1 - mid
3662
3663
pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
3664
eor x24, x24, x14 //AES block 4k+3 - round 12 high
3665
#ifdef __AARCH64EB__
3666
rev x24, x24
3667
#endif
3668
fmov v3.d[1], x9 //CTR block 4k+7
3669
3670
aese v0.16b, v20.16b
3671
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
3672
eor x21, x21, x13 //AES block 4k+2 - round 12 low
3673
#ifdef __AARCH64EB__
3674
rev x21, x21
3675
#endif
3676
pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
3677
eor x22, x22, x14 //AES block 4k+2 - round 12 high
3678
#ifdef __AARCH64EB__
3679
rev x22, x22
3680
#endif
3681
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
3682
3683
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
3684
eor x23, x23, x13 //AES block 4k+3 - round 12 low
3685
#ifdef __AARCH64EB__
3686
rev x23, x23
3687
#endif
3688
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
3689
3690
rev64 v7.16b, v7.16b //GHASH block 4k+3
3691
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
3692
3693
aese v3.16b, v18.16b
3694
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
3695
eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
3696
3697
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
3698
add w12, w12, #1 //CTR block 4k+7
3699
3700
pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
3701
eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
3702
3703
aese v2.16b, v18.16b
3704
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
3705
3706
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
3707
mov d31, v6.d[1] //GHASH block 4k+2 - mid
3708
3709
aese v3.16b, v19.16b
3710
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
3711
3712
aese v2.16b, v19.16b
3713
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
3714
eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
3715
3716
eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
3717
3718
pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
3719
3720
aese v2.16b, v20.16b
3721
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
3722
mov d30, v7.d[1] //GHASH block 4k+3 - mid
3723
3724
aese v3.16b, v20.16b
3725
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
3726
ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
3727
3728
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
3729
3730
aese v0.16b, v21.16b
3731
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
3732
eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
3733
3734
aese v1.16b, v21.16b
3735
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
3736
3737
pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
3738
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
3739
3740
aese v0.16b, v22.16b
3741
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
3742
3743
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
3744
movi v8.8b, #0xc2
3745
3746
pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
3747
3748
aese v2.16b, v21.16b
3749
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
3750
3751
shl d8, d8, #56 //mod_constant
3752
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
3753
3754
aese v0.16b, v23.16b
3755
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
3756
eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
3757
3758
aese v2.16b, v22.16b
3759
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
3760
3761
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
3762
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
3763
3764
aese v0.16b, v24.16b
3765
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
3766
3767
aese v3.16b, v21.16b
3768
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
3769
eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
3770
3771
aese v2.16b, v23.16b
3772
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
3773
3774
aese v0.16b, v25.16b
3775
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
3776
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
3777
3778
aese v3.16b, v22.16b
3779
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
3780
3781
aese v2.16b, v24.16b
3782
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
3783
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
3784
3785
aese v0.16b, v26.16b
3786
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
3787
3788
aese v3.16b, v23.16b
3789
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
3790
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
3791
3792
aese v1.16b, v22.16b
3793
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
3794
3795
aese v2.16b, v25.16b
3796
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
3797
3798
aese v0.16b, v27.16b
3799
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
3800
3801
aese v1.16b, v23.16b
3802
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
3803
3804
aese v3.16b, v24.16b
3805
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
3806
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
3807
3808
aese v0.16b, v28.16b
3809
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
3810
3811
aese v1.16b, v24.16b
3812
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
3813
3814
aese v3.16b, v25.16b
3815
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
3816
3817
aese v2.16b, v26.16b
3818
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
3819
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
3820
3821
aese v1.16b, v25.16b
3822
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
3823
3824
aese v3.16b, v26.16b
3825
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
3826
3827
aese v2.16b, v27.16b
3828
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
3829
3830
aese v1.16b, v26.16b
3831
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
3832
3833
aese v3.16b, v27.16b
3834
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
3835
3836
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
3837
3838
aese v1.16b, v27.16b
3839
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
3840
3841
aese v2.16b, v28.16b
3842
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
3843
3844
aese v3.16b, v28.16b
3845
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
3846
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
3847
3848
aese v1.16b, v28.16b
3849
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
3850
3851
aese v0.16b, v29.16b
3852
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
3853
3854
aese v2.16b, v29.16b
3855
3856
aese v1.16b, v29.16b
3857
3858
aese v3.16b, v29.16b
3859
3860
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
3861
.L192_dec_tail: //TAIL
3862
3863
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
3864
ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
3865
3866
eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result
3867
3868
mov x7, v0.d[1] //AES block 4k+4 - mov high
3869
3870
mov x6, v0.d[0] //AES block 4k+4 - mov low
3871
3872
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
3873
3874
cmp x5, #48
3875
3876
eor x7, x7, x14 //AES block 4k+4 - round 12 high
3877
#ifdef __AARCH64EB__
3878
rev x7, x7
3879
#endif
3880
eor x6, x6, x13 //AES block 4k+4 - round 12 low
3881
#ifdef __AARCH64EB__
3882
rev x6, x6
3883
#endif
3884
b.gt .L192_dec_blocks_more_than_3
3885
3886
movi v11.8b, #0
3887
movi v9.8b, #0
3888
3889
mov v3.16b, v2.16b
3890
mov v2.16b, v1.16b
3891
sub w12, w12, #1
3892
3893
movi v10.8b, #0
3894
cmp x5, #32
3895
b.gt .L192_dec_blocks_more_than_2
3896
3897
mov v3.16b, v1.16b
3898
cmp x5, #16
3899
sub w12, w12, #1
3900
3901
b.gt .L192_dec_blocks_more_than_1
3902
3903
sub w12, w12, #1
3904
b .L192_dec_blocks_less_than_1
3905
.L192_dec_blocks_more_than_3: //blocks left > 3
3906
rev64 v4.16b, v5.16b //GHASH final-3 block
3907
ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext
3908
3909
stp x6, x7, [x2], #16 //AES final-3 block - store result
3910
3911
eor v4.16b, v4.16b, v8.16b //feed in partial tag
3912
3913
eor v0.16b, v5.16b, v1.16b //AES final-2 block - result
3914
3915
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
3916
mov x6, v0.d[0] //AES final-2 block - mov low
3917
mov d22, v4.d[1] //GHASH final-3 block - mid
3918
3919
mov x7, v0.d[1] //AES final-2 block - mov high
3920
3921
mov d10, v17.d[1] //GHASH final-3 block - mid
3922
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
3923
3924
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
3925
3926
eor x6, x6, x13 //AES final-2 block - round 12 low
3927
#ifdef __AARCH64EB__
3928
rev x6, x6
3929
#endif
3930
movi v8.8b, #0 //suppress further partial tag feed in
3931
3932
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
3933
eor x7, x7, x14 //AES final-2 block - round 12 high
3934
#ifdef __AARCH64EB__
3935
rev x7, x7
3936
#endif
3937
.L192_dec_blocks_more_than_2: //blocks left > 2
3938
3939
rev64 v4.16b, v5.16b //GHASH final-2 block
3940
ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext
3941
3942
eor v4.16b, v4.16b, v8.16b //feed in partial tag
3943
3944
movi v8.8b, #0 //suppress further partial tag feed in
3945
3946
eor v0.16b, v5.16b, v2.16b //AES final-1 block - result
3947
3948
mov d22, v4.d[1] //GHASH final-2 block - mid
3949
3950
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
3951
3952
stp x6, x7, [x2], #16 //AES final-2 block - store result
3953
3954
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
3955
mov x7, v0.d[1] //AES final-1 block - mov high
3956
3957
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
3958
mov x6, v0.d[0] //AES final-1 block - mov low
3959
3960
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
3961
3962
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
3963
3964
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
3965
eor x7, x7, x14 //AES final-1 block - round 12 high
3966
#ifdef __AARCH64EB__
3967
rev x7, x7
3968
#endif
3969
eor x6, x6, x13 //AES final-1 block - round 12 low
3970
#ifdef __AARCH64EB__
3971
rev x6, x6
3972
#endif
3973
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
3974
.L192_dec_blocks_more_than_1: //blocks left > 1
3975
3976
rev64 v4.16b, v5.16b //GHASH final-1 block
3977
3978
eor v4.16b, v4.16b, v8.16b //feed in partial tag
3979
ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext
3980
3981
mov d22, v4.d[1] //GHASH final-1 block - mid
3982
3983
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
3984
3985
eor v0.16b, v5.16b, v3.16b //AES final block - result
3986
stp x6, x7, [x2], #16 //AES final-1 block - store result
3987
3988
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
3989
3990
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
3991
3992
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
3993
mov x7, v0.d[1] //AES final block - mov high
3994
3995
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
3996
mov x6, v0.d[0] //AES final block - mov low
3997
3998
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
3999
4000
movi v8.8b, #0 //suppress further partial tag feed in
4001
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
4002
eor x7, x7, x14 //AES final block - round 12 high
4003
#ifdef __AARCH64EB__
4004
rev x7, x7
4005
#endif
4006
eor x6, x6, x13 //AES final block - round 12 low
4007
#ifdef __AARCH64EB__
4008
rev x6, x6
4009
#endif
4010
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
4011
// Final (possibly partial) block of aes_gcm_dec_192_kernel, followed by the
// GHASH reduction, tag store and function epilogue.
//
// State consumed here, as read by the instructions below:
//   x1      bit length of the input
//   x2      output pointer for the last block
//   x6, x7  low / high 64 bits of the last decrypted block
//   v5      last ciphertext block
//   v8      partial tag to fold into the first block hashed here (or zero)
//   v9, v10, v11  GHASH high / mid / low accumulators
//   v12, v16      hash-key operands for the final-block multiplies
//   w12     counter value written back to [x16, #12]
//   x15     value returned in x0
// NOTE(review): x15 is set outside this view; by analogy with
// aes_gcm_enc_256_kernel later in this file it presumably holds the byte
// length -- confirm.
.L192_dec_blocks_less_than_1: //blocks left <= 1
4012
4013
mvn x13, xzr //rk12_l = 0xffffffffffffffff
4014
ldp x4, x5, [x2] //load existing bytes we need to not overwrite
4015
// The next four instructions compute x1 = (128 - (bits % 128)) % 128, i.e.
// the number of unused bits in the last block (0 when the block is full).
and x1, x1, #127 //bit_length %= 128
4016
4017
sub x1, x1, #128 //bit_length -= 128
4018
4019
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
4020
4021
and x1, x1, #127 //bit_length %= 128
4022
mvn x14, xzr //rk12_h = 0xffffffffffffffff
4023
4024
// A register-specified shift uses the amount modulo 64, so for x1 >= 64 the
// value below is all-ones >> (x1 - 64), which is what the "ge" case needs.
lsr x14, x14, x1 //rk12_h is mask for top 64b of last block
4025
cmp x1, #64
4026
4027
// x1 <  64: x9 (first 8 bytes) = all ones, x10 (last 8 bytes) = shifted mask
// x1 >= 64: x9 = shifted mask,              x10 = 0
csel x9, x13, x14, lt
4028
csel x10, x14, xzr, lt
4029
4030
fmov d0, x9 //ctr0b is mask for last block
4031
// Keep the valid output bits in x6 and take the remaining bits from the
// bytes already present at [x2], so bytes beyond the message are preserved.
and x6, x6, x9
4032
bic x4, x4, x9 //mask out low existing bytes
4033
4034
orr x6, x6, x4
4035
mov v0.d[1], x10
4036
// w12 is byte-swapped on little-endian builds before being stored.
#ifndef __AARCH64EB__
4037
rev w9, w12
4038
#else
4039
mov w9, w12
4040
#endif
4041
4042
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
4043
str w9, [x16, #12] //store the updated counter
4044
4045
rev64 v4.16b, v5.16b //GHASH final block
4046
4047
eor v4.16b, v4.16b, v8.16b //feed in partial tag
4048
bic x5, x5, x10 //mask out high existing bytes
4049
4050
and x7, x7, x10
4051
4052
// Karatsuba multiply of the final block: high and low products against v12,
// middle product of (high ^ low) against the low lane of v16.
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
4053
mov d8, v4.d[1] //GHASH final block - mid
4054
4055
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
4056
4057
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
4058
4059
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
4060
4061
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
4062
4063
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
4064
4065
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
4066
// Build the reduction constant: 0xc2 in every byte of d8, then shifted left
// by 56 so that only 0xc2 << 56 remains in the 64-bit lane.
movi v8.8b, #0xc2
4067
4068
eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
4069
4070
shl d8, d8, #56 //mod_constant
4071
4072
eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
4073
4074
pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
4075
// Write the merged final output block (valid bits plus preserved old bytes).
orr x7, x7, x5
4076
stp x6, x7, [x2]
4077
4078
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
4079
4080
eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
4081
4082
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
4083
4084
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
4085
4086
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
4087
4088
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
4089
4090
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
4091
// Swap the two 64-bit halves and byte-reverse each one (a full 16-byte
// reversal of the reduced value), then store it as the updated tag at [x3].
ext v11.16b, v11.16b, v11.16b, #8
4092
rev64 v11.16b, v11.16b
4093
mov x0, x15
4094
st1 { v11.16b }, [x3]
4095
4096
// Epilogue: reload x19-x24 and d8-d15 and pop the 112-byte frame. The offsets
// match the save layout used by the aes_gcm_enc_256_kernel prologue below.
ldp x21, x22, [sp, #16]
4097
ldp x23, x24, [sp, #32]
4098
ldp d8, d9, [sp, #48]
4099
ldp d10, d11, [sp, #64]
4100
ldp d12, d13, [sp, #80]
4101
ldp d14, d15, [sp, #96]
4102
ldp x19, x20, [sp], #112
4103
ret
4104
4105
// Alternate return path of aes_gcm_dec_192_kernel: returns 0 in w0. Nothing
// is restored or popped here.
// NOTE(review): the branch that targets this label is outside this view;
// presumably it is the zero-length check at function entry, before the frame
// is pushed (cf. "cbz x1, .L256_enc_ret" in aes_gcm_enc_256_kernel) -- confirm.
.L192_dec_ret:
4106
mov w0, #0x0
4107
ret
4108
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
4109
// aes_gcm_enc_256_kernel
//
// AES-256-GCM bulk encryption: counter-mode AES (14 rounds, rk0..rk14)
// interleaved with GHASH accumulation, processed four blocks at a time.
//
// Arguments, as used by the code below:
//   x0  input (plaintext) pointer
//   x1  input length in bits (byte_len = x1 >> 3); zero branches straight to
//       .L256_enc_ret before any register is saved
//   x2  output (ciphertext) pointer
//   x3  GHASH state: current tag at [x3], hash-key values at [x3, #32] (h1),
//       [x3, #64] (h2), [x3, #80] (h3) and [x3, #112] (h4)
//   x4  pointer to the 16-byte counter block (copied to x16)
//   x5  pointer to the AES round keys (copied to x8)
//
// Register roles established by this section:
//   x4       end_input_ptr = x0 + byte_len
//   x5       x0 + ((byte_len - 1) & ~63): input address at which the
//            four-block main loop must stop, leaving >= 1 byte for the tail
//   x15      byte_len
//   x10      first 8 bytes of the counter block
//   x11      bytes 8..11 of the counter block, zero-extended
//   w12      rev_ctr32, the 32-bit block counter being incremented
//   x9       scratch for the high half of each assembled counter block
//   x13/x14  low / high 64 bits of rk14, applied through GPR XORs
//   v18-v31  rk0-rk13
//   v12-v15  h1-h4; v16 = h2k | h1k, v17 = h4k | h3k (Karatsuba operands)
//   v0-v3    counter / keystream blocks; v4-v7 plaintext -> ciphertext
//   v11      GHASH accumulator loaded from [x3]
//
// The last AES round is split: "aese" with rk13 (no aesmc) on the vector
// side, while rk14 is XORed into the plaintext in general registers; the
// final vector "eor" then yields the ciphertext.
.globl aes_gcm_enc_256_kernel
4110
.type aes_gcm_enc_256_kernel,%function
4111
.align 4
4112
aes_gcm_enc_256_kernel:
4113
AARCH64_VALID_CALL_TARGET
4114
cbz x1, .L256_enc_ret
4115
// Push a 112-byte frame holding x19-x24 and d8-d15.
stp x19, x20, [sp, #-112]!
4116
mov x16, x4
4117
mov x8, x5
4118
stp x21, x22, [sp, #16]
4119
stp x23, x24, [sp, #32]
4120
stp d8, d9, [sp, #48]
4121
stp d10, d11, [sp, #64]
4122
stp d12, d13, [sp, #80]
4123
stp d14, d15, [sp, #96]
4124
4125
add x4, x0, x1, lsr #3 //end_input_ptr
4126
lsr x5, x1, #3 //byte_len
4127
mov x15, x5
4128
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
4129
#ifdef __AARCH64EB__
4130
rev x10, x10
4131
rev x11, x11
4132
#endif
4133
ldp x13, x14, [x8, #224] //load rk14
4134
#ifdef __AARCH64EB__
4135
ror x13, x13, #32
4136
ror x14, x14, #32
4137
#endif
4138
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
4139
sub x5, x5, #1 //byte_len - 1
4140
4141
ld1 {v18.4s}, [x8], #16 //load rk0
4142
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4143
4144
ld1 {v19.4s}, [x8], #16 //load rk1
4145
add x5, x5, x0
4146
4147
lsr x12, x11, #32
4148
fmov d2, x10 //CTR block 2
4149
// Writing w11 clears bits 63:32 of x11, leaving only its low 32 bits.
orr w11, w11, w11
4150
4151
rev w12, w12 //rev_ctr32
4152
// The flags set here are consumed by "b.ge .L256_enc_tail" far below; no
// instruction in between modifies the condition flags.
cmp x0, x5 //check if we have <= 4 blocks
4153
fmov d1, x10 //CTR block 1
4154
4155
aese v0.16b, v18.16b
4156
aesmc v0.16b, v0.16b //AES block 0 - round 0
4157
add w12, w12, #1 //increment rev_ctr32
4158
4159
rev w9, w12 //CTR block 1
4160
fmov d3, x10 //CTR block 3
4161
4162
orr x9, x11, x9, lsl #32 //CTR block 1
4163
add w12, w12, #1 //CTR block 1
4164
ld1 {v20.4s}, [x8], #16 //load rk2
4165
4166
fmov v1.d[1], x9 //CTR block 1
4167
rev w9, w12 //CTR block 2
4168
add w12, w12, #1 //CTR block 2
4169
4170
orr x9, x11, x9, lsl #32 //CTR block 2
4171
ld1 {v21.4s}, [x8], #16 //load rk3
4172
4173
fmov v2.d[1], x9 //CTR block 2
4174
rev w9, w12 //CTR block 3
4175
4176
aese v0.16b, v19.16b
4177
aesmc v0.16b, v0.16b //AES block 0 - round 1
4178
orr x9, x11, x9, lsl #32 //CTR block 3
4179
4180
fmov v3.d[1], x9 //CTR block 3
4181
4182
aese v1.16b, v18.16b
4183
aesmc v1.16b, v1.16b //AES block 1 - round 0
4184
ld1 {v22.4s}, [x8], #16 //load rk4
4185
4186
aese v0.16b, v20.16b
4187
aesmc v0.16b, v0.16b //AES block 0 - round 2
4188
ld1 {v23.4s}, [x8], #16 //load rk5
4189
4190
aese v2.16b, v18.16b
4191
aesmc v2.16b, v2.16b //AES block 2 - round 0
4192
ld1 {v24.4s}, [x8], #16 //load rk6
4193
4194
aese v1.16b, v19.16b
4195
aesmc v1.16b, v1.16b //AES block 1 - round 1
4196
ldr q14, [x3, #80] //load h3l | h3h
4197
#ifndef __AARCH64EB__
4198
ext v14.16b, v14.16b, v14.16b, #8
4199
#endif
4200
aese v3.16b, v18.16b
4201
aesmc v3.16b, v3.16b //AES block 3 - round 0
4202
ld1 {v25.4s}, [x8], #16 //load rk7
4203
4204
aese v2.16b, v19.16b
4205
aesmc v2.16b, v2.16b //AES block 2 - round 1
4206
ld1 {v26.4s}, [x8], #16 //load rk8
4207
4208
aese v1.16b, v20.16b
4209
aesmc v1.16b, v1.16b //AES block 1 - round 2
4210
ldr q13, [x3, #64] //load h2l | h2h
4211
#ifndef __AARCH64EB__
4212
ext v13.16b, v13.16b, v13.16b, #8
4213
#endif
4214
aese v3.16b, v19.16b
4215
aesmc v3.16b, v3.16b //AES block 3 - round 1
4216
ld1 {v27.4s}, [x8], #16 //load rk9
4217
4218
aese v2.16b, v20.16b
4219
aesmc v2.16b, v2.16b //AES block 2 - round 2
4220
ldr q15, [x3, #112] //load h4l | h4h
4221
#ifndef __AARCH64EB__
4222
ext v15.16b, v15.16b, v15.16b, #8
4223
#endif
4224
aese v1.16b, v21.16b
4225
aesmc v1.16b, v1.16b //AES block 1 - round 3
4226
ld1 {v28.4s}, [x8], #16 //load rk10
4227
4228
aese v3.16b, v20.16b
4229
aesmc v3.16b, v3.16b //AES block 3 - round 2
4230
ld1 {v29.4s}, [x8], #16 //load rk11
4231
4232
aese v2.16b, v21.16b
4233
aesmc v2.16b, v2.16b //AES block 2 - round 3
4234
add w12, w12, #1 //CTR block 3
4235
4236
aese v0.16b, v21.16b
4237
aesmc v0.16b, v0.16b //AES block 0 - round 3
4238
4239
aese v3.16b, v21.16b
4240
aesmc v3.16b, v3.16b //AES block 3 - round 3
4241
// Load the current tag, swap its 64-bit halves and byte-reverse each half to
// obtain the internal GHASH accumulator representation in v11.
ld1 { v11.16b}, [x3]
4242
ext v11.16b, v11.16b, v11.16b, #8
4243
rev64 v11.16b, v11.16b
4244
4245
aese v2.16b, v22.16b
4246
aesmc v2.16b, v2.16b //AES block 2 - round 4
4247
4248
aese v0.16b, v22.16b
4249
aesmc v0.16b, v0.16b //AES block 0 - round 4
4250
4251
aese v1.16b, v22.16b
4252
aesmc v1.16b, v1.16b //AES block 1 - round 4
4253
4254
aese v3.16b, v22.16b
4255
aesmc v3.16b, v3.16b //AES block 3 - round 4
4256
4257
aese v0.16b, v23.16b
4258
aesmc v0.16b, v0.16b //AES block 0 - round 5
4259
4260
aese v1.16b, v23.16b
4261
aesmc v1.16b, v1.16b //AES block 1 - round 5
4262
4263
aese v3.16b, v23.16b
4264
aesmc v3.16b, v3.16b //AES block 3 - round 5
4265
4266
aese v2.16b, v23.16b
4267
aesmc v2.16b, v2.16b //AES block 2 - round 5
4268
4269
aese v1.16b, v24.16b
4270
aesmc v1.16b, v1.16b //AES block 1 - round 6
4271
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
4272
4273
aese v3.16b, v24.16b
4274
aesmc v3.16b, v3.16b //AES block 3 - round 6
4275
ld1 {v30.4s}, [x8], #16 //load rk12
4276
4277
aese v0.16b, v24.16b
4278
aesmc v0.16b, v0.16b //AES block 0 - round 6
4279
ldr q12, [x3, #32] //load h1l | h1h
4280
#ifndef __AARCH64EB__
4281
ext v12.16b, v12.16b, v12.16b, #8
4282
#endif
4283
aese v2.16b, v24.16b
4284
aesmc v2.16b, v2.16b //AES block 2 - round 6
4285
ld1 {v31.4s}, [x8], #16 //load rk13
4286
4287
aese v1.16b, v25.16b
4288
aesmc v1.16b, v1.16b //AES block 1 - round 7
4289
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
4290
4291
aese v0.16b, v25.16b
4292
aesmc v0.16b, v0.16b //AES block 0 - round 7
4293
4294
aese v2.16b, v25.16b
4295
aesmc v2.16b, v2.16b //AES block 2 - round 7
4296
4297
aese v3.16b, v25.16b
4298
aesmc v3.16b, v3.16b //AES block 3 - round 7
4299
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
4300
4301
aese v1.16b, v26.16b
4302
aesmc v1.16b, v1.16b //AES block 1 - round 8
4303
4304
aese v2.16b, v26.16b
4305
aesmc v2.16b, v2.16b //AES block 2 - round 8
4306
4307
aese v3.16b, v26.16b
4308
aesmc v3.16b, v3.16b //AES block 3 - round 8
4309
4310
aese v1.16b, v27.16b
4311
aesmc v1.16b, v1.16b //AES block 1 - round 9
4312
4313
aese v2.16b, v27.16b
4314
aesmc v2.16b, v2.16b //AES block 2 - round 9
4315
4316
aese v0.16b, v26.16b
4317
aesmc v0.16b, v0.16b //AES block 0 - round 8
4318
4319
aese v1.16b, v28.16b
4320
aesmc v1.16b, v1.16b //AES block 1 - round 10
4321
4322
aese v3.16b, v27.16b
4323
aesmc v3.16b, v3.16b //AES block 3 - round 9
4324
4325
aese v0.16b, v27.16b
4326
aesmc v0.16b, v0.16b //AES block 0 - round 9
4327
4328
aese v2.16b, v28.16b
4329
aesmc v2.16b, v2.16b //AES block 2 - round 10
4330
4331
aese v3.16b, v28.16b
4332
aesmc v3.16b, v3.16b //AES block 3 - round 10
4333
4334
aese v1.16b, v29.16b
4335
aesmc v1.16b, v1.16b //AES block 1 - round 11
4336
4337
aese v2.16b, v29.16b
4338
aesmc v2.16b, v2.16b //AES block 2 - round 11
4339
4340
aese v0.16b, v28.16b
4341
aesmc v0.16b, v0.16b //AES block 0 - round 10
4342
4343
aese v1.16b, v30.16b
4344
aesmc v1.16b, v1.16b //AES block 1 - round 12
4345
4346
aese v2.16b, v30.16b
4347
aesmc v2.16b, v2.16b //AES block 2 - round 12
4348
4349
aese v0.16b, v29.16b
4350
aesmc v0.16b, v0.16b //AES block 0 - round 11
4351
eor v17.16b, v17.16b, v9.16b //h4k | h3k
4352
4353
aese v3.16b, v29.16b
4354
aesmc v3.16b, v3.16b //AES block 3 - round 11
4355
4356
aese v2.16b, v31.16b //AES block 2 - round 13
4357
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
4358
4359
aese v0.16b, v30.16b
4360
aesmc v0.16b, v0.16b //AES block 0 - round 12
4361
4362
aese v3.16b, v30.16b
4363
aesmc v3.16b, v3.16b //AES block 3 - round 12
4364
4365
aese v1.16b, v31.16b //AES block 1 - round 13
4366
4367
aese v0.16b, v31.16b //AES block 0 - round 13
4368
4369
aese v3.16b, v31.16b //AES block 3 - round 13
4370
eor v16.16b, v16.16b, v8.16b //h2k | h1k
4371
// Uses the flags from "cmp x0, x5" above: taken when x0 >= x5, i.e. when
// (byte_len - 1) & ~63 is zero and the whole input (at most 64 bytes) is
// left to the tail code.
b.ge .L256_enc_tail //handle tail
4372
4373
// More than 64 bytes of input: produce the first four ciphertext blocks.
ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
4374
#ifdef __AARCH64EB__
4375
rev x19, x19
4376
rev x20, x20
4377
#endif
4378
rev w9, w12 //CTR block 4
4379
ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
4380
#ifdef __AARCH64EB__
4381
rev x6, x6
4382
rev x7, x7
4383
#endif
4384
ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
4385
#ifdef __AARCH64EB__
4386
rev x23, x23
4387
rev x24, x24
4388
#endif
4389
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
4390
#ifdef __AARCH64EB__
4391
rev x21, x21
4392
rev x22, x22
4393
#endif
4394
add x0, x0, #64 //AES input_ptr update
4395
4396
eor x19, x19, x13 //AES block 1 - round 14 low
4397
eor x20, x20, x14 //AES block 1 - round 14 high
4398
4399
fmov d5, x19 //AES block 1 - mov low
4400
eor x6, x6, x13 //AES block 0 - round 14 low
4401
4402
eor x7, x7, x14 //AES block 0 - round 14 high
4403
eor x24, x24, x14 //AES block 3 - round 14 high
4404
fmov d4, x6 //AES block 0 - mov low
4405
4406
// Flags from this compare are consumed by "b.ge .L256_enc_prepretail" at the
// end of this section; nothing in between modifies the condition flags.
cmp x0, x5 //check if we have <= 8 blocks
4407
fmov v4.d[1], x7 //AES block 0 - mov high
4408
eor x23, x23, x13 //AES block 3 - round 14 low
4409
4410
eor x21, x21, x13 //AES block 2 - round 14 low
4411
fmov v5.d[1], x20 //AES block 1 - mov high
4412
4413
fmov d6, x21 //AES block 2 - mov low
4414
add w12, w12, #1 //CTR block 4
4415
4416
orr x9, x11, x9, lsl #32 //CTR block 4
4417
fmov d7, x23 //AES block 3 - mov low
4418
eor x22, x22, x14 //AES block 2 - round 14 high
4419
4420
fmov v6.d[1], x22 //AES block 2 - mov high
4421
4422
eor v4.16b, v4.16b, v0.16b //AES block 0 - result
4423
fmov d0, x10 //CTR block 4
4424
4425
fmov v0.d[1], x9 //CTR block 4
4426
rev w9, w12 //CTR block 5
4427
add w12, w12, #1 //CTR block 5
4428
4429
eor v5.16b, v5.16b, v1.16b //AES block 1 - result
4430
fmov d1, x10 //CTR block 5
4431
orr x9, x11, x9, lsl #32 //CTR block 5
4432
4433
fmov v1.d[1], x9 //CTR block 5
4434
rev w9, w12 //CTR block 6
4435
st1 { v4.16b}, [x2], #16 //AES block 0 - store result
4436
4437
fmov v7.d[1], x24 //AES block 3 - mov high
4438
orr x9, x11, x9, lsl #32 //CTR block 6
4439
eor v6.16b, v6.16b, v2.16b //AES block 2 - result
4440
4441
st1 { v5.16b}, [x2], #16 //AES block 1 - store result
4442
4443
add w12, w12, #1 //CTR block 6
4444
fmov d2, x10 //CTR block 6
4445
4446
fmov v2.d[1], x9 //CTR block 6
4447
st1 { v6.16b}, [x2], #16 //AES block 2 - store result
4448
rev w9, w12 //CTR block 7
4449
4450
// x9 now holds the high half of CTR block 7; the code that follows (main
// loop or prepretail) moves it into v3.
orr x9, x11, x9, lsl #32 //CTR block 7
4451
4452
eor v7.16b, v7.16b, v3.16b //AES block 3 - result
4453
st1 { v7.16b}, [x2], #16 //AES block 3 - store result
4454
b.ge .L256_enc_prepretail //do prepretail
4455
4456
// Main loop of aes_gcm_enc_256_kernel: one pass per 64 bytes of input,
// repeated while x0 < x5. Each pass interleaves:
//   * AES rounds 0..13 on the four counter blocks in v0..v3 (v3 is assembled
//     at the top of the pass from x10 and the x9 value prepared before the
//     loop was entered or before the previous pass branched back);
//   * GHASH of the four ciphertext blocks left in v4..v7 by the preceding
//     code, multiplied by h4..h1 (v15, v14, v13, v12) with the Karatsuba
//     middle terms taken from v17 / v16, followed by the modular reduction
//     with the 0xc2 << 56 constant built in v8; the result ends up in v11;
//   * loading the next 64 plaintext bytes into x6/x7 and x19..x24, XORing
//     them with rk14 (x13/x14) and then with the keystream, and storing the
//     ciphertext through x2;
//   * preparing counter blocks 4k+8 .. 4k+11 for the next pass.
.L256_enc_main_loop: //main loop start
4457
aese v0.16b, v18.16b
4458
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
4459
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
4460
4461
aese v1.16b, v18.16b
4462
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
4463
fmov d3, x10 //CTR block 4k+3
4464
4465
aese v2.16b, v18.16b
4466
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
4467
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
4468
4469
aese v0.16b, v19.16b
4470
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
4471
fmov v3.d[1], x9 //CTR block 4k+3
4472
4473
aese v1.16b, v19.16b
4474
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
4475
ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext
4476
#ifdef __AARCH64EB__
4477
rev x23, x23
4478
rev x24, x24
4479
#endif
4480
aese v2.16b, v19.16b
4481
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
4482
ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
4483
#ifdef __AARCH64EB__
4484
rev x21, x21
4485
rev x22, x22
4486
#endif
4487
aese v0.16b, v20.16b
4488
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
4489
// Fold the running tag into the first block to be hashed in this pass.
eor v4.16b, v4.16b, v11.16b //PRE 1
4490
4491
aese v1.16b, v20.16b
4492
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
4493
4494
aese v3.16b, v18.16b
4495
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
4496
eor x23, x23, x13 //AES block 4k+7 - round 14 low
4497
4498
aese v0.16b, v21.16b
4499
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
4500
mov d10, v17.d[1] //GHASH block 4k - mid
4501
4502
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
4503
eor x22, x22, x14 //AES block 4k+6 - round 14 high
4504
mov d8, v4.d[1] //GHASH block 4k - mid
4505
4506
aese v3.16b, v19.16b
4507
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
4508
rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
4509
4510
aese v0.16b, v22.16b
4511
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
4512
4513
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
4514
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
4515
4516
aese v2.16b, v20.16b
4517
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
4518
4519
aese v0.16b, v23.16b
4520
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
4521
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4522
4523
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
4524
4525
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
4526
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
4527
4528
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
4529
4530
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
4531
mov d4, v5.d[1] //GHASH block 4k+1 - mid
4532
4533
aese v1.16b, v21.16b
4534
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
4535
4536
aese v3.16b, v20.16b
4537
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
4538
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
4539
4540
aese v2.16b, v21.16b
4541
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
4542
4543
aese v1.16b, v22.16b
4544
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
4545
mov d8, v6.d[1] //GHASH block 4k+2 - mid
4546
4547
aese v3.16b, v21.16b
4548
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
4549
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
4550
4551
aese v2.16b, v22.16b
4552
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
4553
4554
aese v0.16b, v24.16b
4555
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
4556
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
4557
4558
aese v3.16b, v22.16b
4559
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
4560
4561
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
4562
4563
aese v0.16b, v25.16b
4564
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
4565
4566
aese v3.16b, v23.16b
4567
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
4568
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
4569
4570
aese v1.16b, v23.16b
4571
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
4572
4573
aese v0.16b, v26.16b
4574
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
4575
4576
aese v2.16b, v23.16b
4577
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
4578
4579
aese v1.16b, v24.16b
4580
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
4581
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
4582
4583
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
4584
4585
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
4586
4587
aese v1.16b, v25.16b
4588
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
4589
4590
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
4591
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
4592
4593
aese v3.16b, v24.16b
4594
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
4595
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
4596
#ifdef __AARCH64EB__
4597
rev x19, x19
4598
rev x20, x20
4599
#endif
4600
aese v1.16b, v26.16b
4601
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
4602
mov d4, v7.d[1] //GHASH block 4k+3 - mid
4603
4604
aese v2.16b, v24.16b
4605
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
4606
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
4607
4608
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
4609
4610
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
4611
eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid
4612
4613
aese v2.16b, v25.16b
4614
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
4615
eor x19, x19, x13 //AES block 4k+5 - round 14 low
4616
4617
aese v1.16b, v27.16b
4618
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
4619
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
4620
4621
aese v3.16b, v25.16b
4622
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
4623
eor x21, x21, x13 //AES block 4k+6 - round 14 low
4624
4625
aese v0.16b, v27.16b
4626
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
4627
// v8 is free from here on and is reused for the reduction constant.
movi v8.8b, #0xc2
4628
4629
pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid
4630
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
4631
fmov d5, x19 //AES block 4k+5 - mov low
4632
4633
aese v2.16b, v26.16b
4634
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
4635
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
4636
#ifdef __AARCH64EB__
4637
rev x6, x6
4638
rev x7, x7
4639
#endif
4640
aese v0.16b, v28.16b
4641
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
4642
shl d8, d8, #56 //mod_constant
4643
4644
aese v3.16b, v26.16b
4645
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
4646
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
4647
4648
aese v2.16b, v27.16b
4649
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
4650
4651
aese v1.16b, v28.16b
4652
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
4653
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid
4654
4655
aese v3.16b, v27.16b
4656
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
4657
add w12, w12, #1 //CTR block 4k+3
4658
4659
aese v0.16b, v29.16b
4660
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
4661
eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
4662
4663
aese v1.16b, v29.16b
4664
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
4665
add x0, x0, #64 //AES input_ptr update
4666
4667
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
4668
rev w9, w12 //CTR block 4k+8
4669
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
4670
4671
aese v2.16b, v28.16b
4672
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
4673
eor x6, x6, x13 //AES block 4k+4 - round 14 low
4674
4675
aese v1.16b, v30.16b
4676
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
4677
eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up
4678
4679
aese v3.16b, v28.16b
4680
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
4681
eor x7, x7, x14 //AES block 4k+4 - round 14 high
4682
4683
fmov d4, x6 //AES block 4k+4 - mov low
4684
orr x9, x11, x9, lsl #32 //CTR block 4k+8
4685
eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid
4686
4687
aese v0.16b, v30.16b
4688
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
4689
eor x20, x20, x14 //AES block 4k+5 - round 14 high
4690
4691
aese v2.16b, v29.16b
4692
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
4693
eor x24, x24, x14 //AES block 4k+7 - round 14 high
4694
4695
aese v3.16b, v29.16b
4696
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
4697
add w12, w12, #1 //CTR block 4k+8
4698
4699
aese v0.16b, v31.16b //AES block 4k+4 - round 13
4700
fmov v4.d[1], x7 //AES block 4k+4 - mov high
4701
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
4702
4703
aese v2.16b, v30.16b
4704
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
4705
fmov d7, x23 //AES block 4k+7 - mov low
4706
4707
aese v1.16b, v31.16b //AES block 4k+5 - round 13
4708
fmov v5.d[1], x20 //AES block 4k+5 - mov high
4709
4710
fmov d6, x21 //AES block 4k+6 - mov low
4711
// Loop condition: the flags set here survive until the "b.lt" at the bottom,
// because nothing in between modifies the condition flags.
cmp x0, x5 //.LOOP CONTROL
4712
4713
fmov v6.d[1], x22 //AES block 4k+6 - mov high
4714
4715
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
4716
eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
4717
fmov d0, x10 //CTR block 4k+8
4718
4719
fmov v0.d[1], x9 //CTR block 4k+8
4720
rev w9, w12 //CTR block 4k+9
4721
add w12, w12, #1 //CTR block 4k+9
4722
4723
eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
4724
fmov d1, x10 //CTR block 4k+9
4725
orr x9, x11, x9, lsl #32 //CTR block 4k+9
4726
4727
aese v3.16b, v30.16b
4728
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
4729
fmov v1.d[1], x9 //CTR block 4k+9
4730
4731
aese v2.16b, v31.16b //AES block 4k+6 - round 13
4732
rev w9, w12 //CTR block 4k+10
4733
st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result
4734
4735
orr x9, x11, x9, lsl #32 //CTR block 4k+10
4736
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
4737
fmov v7.d[1], x24 //AES block 4k+7 - mov high
4738
4739
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
4740
st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
4741
add w12, w12, #1 //CTR block 4k+10
4742
4743
aese v3.16b, v31.16b //AES block 4k+7 - round 13
4744
eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
4745
fmov d2, x10 //CTR block 4k+10
4746
4747
st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
4748
fmov v2.d[1], x9 //CTR block 4k+10
4749
rev w9, w12 //CTR block 4k+11
4750
4751
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
4752
// x9 is left holding the high half of CTR block 4k+11 for the code that
// runs next (top of this loop, or prepretail).
orr x9, x11, x9, lsl #32 //CTR block 4k+11
4753
4754
eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result
4755
st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result
4756
b.lt .L256_enc_main_loop
4757
4758
// Prepretail of aes_gcm_enc_256_kernel: reached once x0 >= x5, either by
// falling out of the main loop or directly from the setup code. It contains
// no loads or stores. It does two things:
//   * GHASHes the four ciphertext blocks still held in v4..v7 (already
//     stored to the output) and reduces the result into v11;
//   * runs AES rounds 0..13 on the counter blocks in v0..v3 (v3 is assembled
//     here from x10 and x9) so their keystream is ready for the tail.
// Execution falls through into .L256_enc_tail.
.L256_enc_prepretail: //PREPRETAIL
4759
aese v1.16b, v18.16b
4760
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
4761
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
4762
4763
aese v2.16b, v18.16b
4764
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
4765
fmov d3, x10 //CTR block 4k+3
4766
4767
aese v0.16b, v18.16b
4768
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
4769
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
4770
4771
fmov v3.d[1], x9 //CTR block 4k+3
4772
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
4773
4774
aese v2.16b, v19.16b
4775
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
4776
4777
aese v0.16b, v19.16b
4778
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
4779
4780
// Fold the running tag into the first block to be hashed.
eor v4.16b, v4.16b, v11.16b //PRE 1
4781
rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
4782
4783
aese v2.16b, v20.16b
4784
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
4785
4786
aese v3.16b, v18.16b
4787
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
4788
mov d10, v17.d[1] //GHASH block 4k - mid
4789
4790
aese v1.16b, v19.16b
4791
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
4792
4793
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
4794
mov d8, v4.d[1] //GHASH block 4k - mid
4795
4796
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
4797
4798
aese v2.16b, v21.16b
4799
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
4800
4801
aese v1.16b, v20.16b
4802
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
4803
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
4804
4805
aese v0.16b, v20.16b
4806
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
4807
4808
aese v3.16b, v19.16b
4809
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
4810
4811
aese v1.16b, v21.16b
4812
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
4813
4814
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
4815
4816
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
4817
4818
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
4819
4820
aese v3.16b, v20.16b
4821
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
4822
4823
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
4824
mov d4, v5.d[1] //GHASH block 4k+1 - mid
4825
4826
aese v0.16b, v21.16b
4827
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
4828
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
4829
4830
aese v3.16b, v21.16b
4831
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
4832
4833
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
4834
mov d8, v6.d[1] //GHASH block 4k+2 - mid
4835
4836
aese v0.16b, v22.16b
4837
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
4838
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4839
4840
aese v3.16b, v22.16b
4841
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
4842
4843
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
4844
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
4845
add w12, w12, #1 //CTR block 4k+3
4846
4847
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
4848
4849
aese v3.16b, v23.16b
4850
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
4851
4852
aese v2.16b, v22.16b
4853
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
4854
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
4855
4856
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
4857
4858
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
4859
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
4860
4861
aese v2.16b, v23.16b
4862
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
4863
4864
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
4865
mov d4, v7.d[1] //GHASH block 4k+3 - mid
4866
4867
aese v1.16b, v22.16b
4868
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
4869
4870
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
4871
4872
eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid
4873
4874
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
4875
4876
aese v1.16b, v23.16b
4877
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
4878
4879
pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid
4880
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
4881
4882
aese v0.16b, v23.16b
4883
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
4884
4885
aese v1.16b, v24.16b
4886
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
4887
4888
aese v2.16b, v24.16b
4889
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
4890
4891
aese v0.16b, v24.16b
4892
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
4893
// v8 is reused for the reduction constant: 0xc2 per byte here, shifted left
// by 56 further down so only 0xc2 << 56 remains.
movi v8.8b, #0xc2
4894
4895
aese v3.16b, v24.16b
4896
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
4897
4898
aese v1.16b, v25.16b
4899
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
4900
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
4901
4902
aese v0.16b, v25.16b
4903
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
4904
4905
aese v3.16b, v25.16b
4906
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
4907
shl d8, d8, #56 //mod_constant
4908
4909
aese v1.16b, v26.16b
4910
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
4911
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid
4912
4913
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
4914
4915
aese v3.16b, v26.16b
4916
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
4917
4918
aese v1.16b, v27.16b
4919
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
4920
4921
aese v0.16b, v26.16b
4922
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
4923
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
4924
4925
aese v3.16b, v27.16b
4926
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
4927
4928
// Reduction, spread over the remaining AES rounds: the same steps the main
// loop labels "MODULO" -- fold high (v9) and low (v11) into mid (v10),
// multiply by the constant in v8, then fold mid into low (v11).
eor v10.16b, v10.16b, v9.16b //karatsuba tidy up
4929
4930
pmull v4.1q, v9.1d, v8.1d
4931
ext v9.16b, v9.16b, v9.16b, #8
4932
4933
aese v3.16b, v28.16b
4934
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
4935
4936
aese v2.16b, v25.16b
4937
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
4938
eor v10.16b, v10.16b, v11.16b
4939
4940
aese v1.16b, v28.16b
4941
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
4942
4943
aese v0.16b, v27.16b
4944
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
4945
4946
aese v2.16b, v26.16b
4947
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
4948
4949
aese v1.16b, v29.16b
4950
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
4951
eor v10.16b, v10.16b, v4.16b
4952
4953
aese v0.16b, v28.16b
4954
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
4955
4956
aese v2.16b, v27.16b
4957
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
4958
4959
aese v1.16b, v30.16b
4960
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
4961
4962
aese v0.16b, v29.16b
4963
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
4964
eor v10.16b, v10.16b, v9.16b
4965
4966
aese v3.16b, v29.16b
4967
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
4968
4969
aese v2.16b, v28.16b
4970
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
4971
4972
aese v0.16b, v30.16b
4973
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
4974
4975
pmull v4.1q, v10.1d, v8.1d
4976
4977
aese v2.16b, v29.16b
4978
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
4979
ext v10.16b, v10.16b, v10.16b, #8
4980
4981
aese v3.16b, v30.16b
4982
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
4983
4984
aese v1.16b, v31.16b //AES block 4k+5 - round 13
4985
eor v11.16b, v11.16b, v4.16b
4986
4987
aese v2.16b, v30.16b
4988
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
4989
4990
aese v3.16b, v31.16b //AES block 4k+7 - round 13
4991
4992
aese v0.16b, v31.16b //AES block 4k+4 - round 13
4993
4994
aese v2.16b, v31.16b //AES block 4k+6 - round 13
4995
eor v11.16b, v11.16b, v10.16b
4996
// Tail dispatcher of aes_gcm_enc_256_kernel: handles the last 1..64 bytes.
//
// On entry v0..v3 hold keystream blocks (AES rounds 0..13 done) and v11 holds
// the reduced GHASH state. This section:
//   * parks the GHASH state in v8 (64-bit halves swapped) as the "partial
//     tag" that the first hashed tail block XORs in;
//   * sets x5 = x4 - x0, the number of input bytes left;
//   * encrypts the first remaining block into v5;
//   * dispatches on x5: > 48 -> four blocks left, > 32 -> three, > 16 -> two,
//     otherwise one.
// When fewer than four blocks remain, the keystream registers are shifted
// (v3 <- v2 <- v1, or v3 <- v1) for use by the later tail sections, the
// accumulators v9/v10/v11 are cleared, and w12 is decremented once per
// counter block that was prepared but will not be used.
// NOTE(review): the ldp below always reads 16 bytes even when x5 < 16;
// presumably callers guarantee the bytes are readable -- confirm.
.L256_enc_tail: //TAIL
4997
4998
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
4999
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
5000
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
5001
#ifdef __AARCH64EB__
5002
rev x6, x6
5003
rev x7, x7
5004
#endif
5005
eor x6, x6, x13 //AES block 4k+4 - round 14 low
5006
eor x7, x7, x14 //AES block 4k+4 - round 14 high
5007
5008
cmp x5, #48
5009
fmov d4, x6 //AES block 4k+4 - mov low
5010
5011
fmov v4.d[1], x7 //AES block 4k+4 - mov high
5012
5013
eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result
5014
// More than 48 bytes left: four blocks to process.
b.gt .L256_enc_blocks_more_than_3
5015
5016
// The flags from this compare are used by the b.gt below; the vector moves
// and the non-flag-setting "sub" in between leave them intact.
cmp x5, #32
5017
mov v3.16b, v2.16b
5018
movi v11.8b, #0
5019
5020
movi v9.8b, #0
5021
sub w12, w12, #1
5022
5023
mov v2.16b, v1.16b
5024
movi v10.8b, #0
5025
b.gt .L256_enc_blocks_more_than_2
5026
5027
mov v3.16b, v1.16b
5028
sub w12, w12, #1
5029
cmp x5, #16
5030
5031
b.gt .L256_enc_blocks_more_than_1
5032
5033
sub w12, w12, #1
5034
b .L256_enc_blocks_less_than_1
5035
// Four blocks left: store the ciphertext block computed by the tail
// dispatcher (v5, "final-3"), start its GHASH multiply by h4 (v15, with the
// middle term taken from the upper lane of v17), and encrypt the next
// plaintext block with keystream v1. This section overwrites v9/v10/v11
// with the high/mid/low products. Falls through into
// .L256_enc_blocks_more_than_2.
.L256_enc_blocks_more_than_3: //blocks left > 3
5036
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
5037
5038
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
5039
#ifdef __AARCH64EB__
5040
rev x6, x6
5041
rev x7, x7
5042
#endif
5043
rev64 v4.16b, v5.16b //GHASH final-3 block
5044
5045
eor x6, x6, x13 //AES final-2 block - round 14 low
5046
eor v4.16b, v4.16b, v8.16b //feed in partial tag
5047
5048
eor x7, x7, x14 //AES final-2 block - round 14 high
5049
5050
mov d22, v4.d[1] //GHASH final-3 block - mid
5051
fmov d5, x6 //AES final-2 block - mov low
5052
5053
fmov v5.d[1], x7 //AES final-2 block - mov high
5054
5055
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
5056
// The running tag has been folded in above; zero v8 so the later tail
// sections do not add it a second time.
movi v8.8b, #0 //suppress further partial tag feed in
5057
5058
mov d10, v17.d[1] //GHASH final-3 block - mid
5059
5060
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
5061
5062
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
5063
5064
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
5065
eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
5066
// Three blocks left (or fall-through from the four-block case): store the
// ciphertext block in v5 ("final-2"), multiply it by h3 (v14, middle term
// from the low lane of v17) and accumulate into v9/v10/v11, then encrypt the
// next plaintext block with keystream v2. v8 carries the running tag only if
// it has not been consumed yet; otherwise it is zero. Falls through into
// .L256_enc_blocks_more_than_1.
.L256_enc_blocks_more_than_2: //blocks left > 2
5067
5068
st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
5069
5070
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
5071
#ifdef __AARCH64EB__
5072
rev x6, x6
5073
rev x7, x7
5074
#endif
5075
rev64 v4.16b, v5.16b //GHASH final-2 block
5076
5077
eor x6, x6, x13 //AES final-1 block - round 14 low
5078
eor v4.16b, v4.16b, v8.16b //feed in partial tag
5079
5080
fmov d5, x6 //AES final-1 block - mov low
5081
eor x7, x7, x14 //AES final-1 block - round 14 high
5082
5083
fmov v5.d[1], x7 //AES final-1 block - mov high
5084
5085
movi v8.8b, #0 //suppress further partial tag feed in
5086
5087
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
5088
mov d22, v4.d[1] //GHASH final-2 block - mid
5089
5090
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
5091
5092
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
5093
5094
eor v5.16b, v5.16b, v2.16b //AES final-1 block - result
5095
5096
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
5097
5098
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
5099
5100
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
5101
5102
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
5103
.L256_enc_blocks_more_than_1: //blocks left > 1
// Tail step reached by fall-through from the "> 2" step or from the tail
// dispatch (cmp x5, #16 / b.gt). It stores the block in v5, loads the last
// 16 input bytes, XORs them with x13/x14 and v3 to form the final result
// block in v5, and hashes the stored block: products against v13 (high/low)
// and the upper half of v16 (mid, via ins + pmull2) are XOR-accumulated into
// v9/v11/v10.
// Execution falls through into .L256_enc_blocks_less_than_1.
5104
5105
st1 { v5.16b}, [x2], #16 //AES final-1 block - store result
5106
5107
rev64 v4.16b, v5.16b //GHASH final-1 block
5108
5109
ldp x6, x7, [x0], #16 //AES final block - load input low & high
5110
#ifdef __AARCH64EB__
5111
rev x6, x6 //big-endian build only: byte-swap the first loaded doubleword
5112
rev x7, x7 //big-endian build only: byte-swap the second loaded doubleword
5113
#endif
5114
eor v4.16b, v4.16b, v8.16b //feed in partial tag
5115
5116
movi v8.8b, #0 //suppress further partial tag feed in
5117
5118
eor x6, x6, x13 //AES final block - round 14 low
5119
mov d22, v4.d[1] //GHASH final-1 block - mid
5120
5121
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
5122
eor x7, x7, x14 //AES final block - round 14 high
5123
5124
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
5125
5126
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
5127
5128
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
5129
fmov d5, x6 //AES final block - mov low
5130
5131
fmov v5.d[1], x7 //AES final block - mov high
5132
5133
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
5134
5135
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
5136
5137
eor v5.16b, v5.16b, v3.16b //AES final block - result
5138
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
5139
5140
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
5141
.L256_enc_blocks_less_than_1: //blocks left <= 1
// Final step of the encrypt tail; reached by fall-through from the "> 1"
// step or by the dispatch's unconditional branch. On entry v5 holds the last
// (possibly partial) result block. The code below
//  1. builds a 128-bit mask in v0 from the bit length in x1 (x13/x14 are
//     reused as scratch for the mask and no longer hold key material),
//  2. clears the bytes of v5 beyond the end of the message, hashes that
//     masked block against v12/v16, then re-inserts the bytes that were
//     already present at [x2] so the final 16-byte store leaves them unchanged,
//  3. reduces the high/mid/low GHASH accumulators (v9/v10/v11) with the
//     0xc2 << 56 constant and stores the result at [x3],
//  4. stores the counter word at [x16, #12], restores saved registers, and
//     returns x15 in x0.
// NOTE(review): a full 16 bytes are read from and written to [x2] even when
// the last block is partial - presumably callers guarantee that is safe; confirm.
5142
5143
and x1, x1, #127 //bit_length %= 128
5144
5145
mvn x13, xzr //rk14_l = 0xffffffffffffffff
5146
sub x1, x1, #128 //bit_length -= 128
5147
5148
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
5149
ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
5150
5151
mvn x14, xzr //rk14_h = 0xffffffffffffffff
5152
and x1, x1, #127 //bit_length %= 128
5153
5154
lsr x14, x14, x1 //rk14_h is mask for top 64b of last block
5155
cmp x1, #64 //does the unused part fit inside the upper 64 bits?
5156
5157
csel x6, x13, x14, lt //x1 < 64: low mask is all ones; otherwise low mask is the shifted value
5158
csel x7, x14, xzr, lt //x1 < 64: high mask is the shifted value; otherwise high mask is zero
5159
5160
fmov d0, x6 //ctr0b is mask for last block
5161
5162
fmov v0.d[1], x7 //upper 64 bits of the mask
5163
5164
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
5165
5166
rev64 v4.16b, v5.16b //GHASH final block
5167
5168
eor v4.16b, v4.16b, v8.16b //feed in partial tag
5169
5170
bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing
5171
5172
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
5173
mov d8, v4.d[1] //GHASH final block - mid
5174
#ifndef __AARCH64EB__
5175
rev w9, w12 //little-endian build: byte-reverse the counter word before it is stored
5176
#else
5177
mov w9, w12 //big-endian build: counter word is stored as-is
5178
#endif
5179
5180
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
5181
5182
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
5183
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
5184
5185
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
5186
5187
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
5188
5189
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
5190
movi v8.8b, #0xc2 //0xc2 in every byte of d8; the shl below leaves 0xc2 << 56
5191
5192
eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
5193
5194
shl d8, d8, #56 //mod_constant
5195
5196
eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up
5197
5198
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
5199
5200
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
5201
5202
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
5203
5204
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
5205
5206
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
5207
5208
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
5209
5210
str w9, [x16, #12] //store the updated counter
5211
5212
st1 { v5.16b}, [x2] //store all 16B
5213
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
5214
5215
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
5216
ext v11.16b, v11.16b, v11.16b, #8 //swap the two 64-bit halves of the reduced value
5217
rev64 v11.16b, v11.16b //byte-reverse each 64-bit half before storing
5218
mov x0, x15 //return value = x15 (presumably byte_len saved by the prologue, as the decrypt kernel does - confirm)
5219
st1 { v11.16b }, [x3] //write the updated hash value to [x3]
5220
5221
ldp x21, x22, [sp, #16] //reload x21/x22 from the frame
5222
ldp x23, x24, [sp, #32] //reload x23/x24
5223
ldp d8, d9, [sp, #48] //reload d8/d9
5224
ldp d10, d11, [sp, #64] //reload d10/d11
5225
ldp d12, d13, [sp, #80] //reload d12/d13
5226
ldp d14, d15, [sp, #96] //reload d14/d15
5227
ldp x19, x20, [sp], #112 //reload x19/x20 and release the 112-byte frame
5228
ret
5229
5230
.L256_enc_ret:
// Early-return path: sets w0 to 0 and returns without touching sp.
// NOTE(review): the branch to this label is outside this view; presumably it
// is a zero-length check at function entry taken before the stack frame is
// pushed (the decrypt kernel does cbz x1 to its own *_ret label) - confirm.
5231
mov w0, #0x0 //return 0
5232
ret
5233
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5234
.globl aes_gcm_dec_256_kernel
5235
.type aes_gcm_dec_256_kernel,%function
5236
.align 4
5237
aes_gcm_dec_256_kernel:
5238
AARCH64_VALID_CALL_TARGET
5239
cbz x1, .L256_dec_ret
5240
stp x19, x20, [sp, #-112]!
5241
mov x16, x4
5242
mov x8, x5
5243
stp x21, x22, [sp, #16]
5244
stp x23, x24, [sp, #32]
5245
stp d8, d9, [sp, #48]
5246
stp d10, d11, [sp, #64]
5247
stp d12, d13, [sp, #80]
5248
stp d14, d15, [sp, #96]
5249
5250
lsr x5, x1, #3 //byte_len
5251
mov x15, x5
5252
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
5253
#ifdef __AARCH64EB__
5254
rev x10, x10
5255
rev x11, x11
5256
#endif
5257
ldp x13, x14, [x8, #224] //load rk14
5258
#ifdef __AARCH64EB__
5259
ror x14, x14, #32
5260
ror x13, x13, #32
5261
#endif
5262
ld1 {v18.4s}, [x8], #16 //load rk0
5263
sub x5, x5, #1 //byte_len - 1
5264
5265
ld1 {v19.4s}, [x8], #16 //load rk1
5266
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5267
5268
add x4, x0, x1, lsr #3 //end_input_ptr
5269
ld1 {v20.4s}, [x8], #16 //load rk2
5270
5271
lsr x12, x11, #32
5272
ld1 {v21.4s}, [x8], #16 //load rk3
5273
orr w11, w11, w11
5274
5275
ld1 {v22.4s}, [x8], #16 //load rk4
5276
add x5, x5, x0
5277
rev w12, w12 //rev_ctr32
5278
5279
add w12, w12, #1 //increment rev_ctr32
5280
fmov d3, x10 //CTR block 3
5281
5282
rev w9, w12 //CTR block 1
5283
add w12, w12, #1 //CTR block 1
5284
fmov d1, x10 //CTR block 1
5285
5286
orr x9, x11, x9, lsl #32 //CTR block 1
5287
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
5288
5289
fmov v1.d[1], x9 //CTR block 1
5290
rev w9, w12 //CTR block 2
5291
add w12, w12, #1 //CTR block 2
5292
5293
fmov d2, x10 //CTR block 2
5294
orr x9, x11, x9, lsl #32 //CTR block 2
5295
5296
fmov v2.d[1], x9 //CTR block 2
5297
rev w9, w12 //CTR block 3
5298
5299
orr x9, x11, x9, lsl #32 //CTR block 3
5300
ld1 {v23.4s}, [x8], #16 //load rk5
5301
5302
fmov v3.d[1], x9 //CTR block 3
5303
add w12, w12, #1 //CTR block 3
5304
5305
ld1 {v24.4s}, [x8], #16 //load rk6
5306
5307
ld1 {v25.4s}, [x8], #16 //load rk7
5308
5309
ld1 {v26.4s}, [x8], #16 //load rk8
5310
5311
aese v0.16b, v18.16b
5312
aesmc v0.16b, v0.16b //AES block 0 - round 0
5313
ldr q14, [x3, #80] //load h3l | h3h
5314
#ifndef __AARCH64EB__
5315
ext v14.16b, v14.16b, v14.16b, #8
5316
#endif
5317
5318
aese v3.16b, v18.16b
5319
aesmc v3.16b, v3.16b //AES block 3 - round 0
5320
ldr q15, [x3, #112] //load h4l | h4h
5321
#ifndef __AARCH64EB__
5322
ext v15.16b, v15.16b, v15.16b, #8
5323
#endif
5324
5325
aese v1.16b, v18.16b
5326
aesmc v1.16b, v1.16b //AES block 1 - round 0
5327
ldr q13, [x3, #64] //load h2l | h2h
5328
#ifndef __AARCH64EB__
5329
ext v13.16b, v13.16b, v13.16b, #8
5330
#endif
5331
5332
aese v2.16b, v18.16b
5333
aesmc v2.16b, v2.16b //AES block 2 - round 0
5334
ld1 {v27.4s}, [x8], #16 //load rk9
5335
5336
aese v0.16b, v19.16b
5337
aesmc v0.16b, v0.16b //AES block 0 - round 1
5338
5339
aese v1.16b, v19.16b
5340
aesmc v1.16b, v1.16b //AES block 1 - round 1
5341
ld1 { v11.16b}, [x3]
5342
ext v11.16b, v11.16b, v11.16b, #8
5343
rev64 v11.16b, v11.16b
5344
5345
aese v2.16b, v19.16b
5346
aesmc v2.16b, v2.16b //AES block 2 - round 1
5347
ld1 {v28.4s}, [x8], #16 //load rk10
5348
5349
aese v3.16b, v19.16b
5350
aesmc v3.16b, v3.16b //AES block 3 - round 1
5351
ld1 {v29.4s}, [x8], #16 //load rk11
5352
5353
aese v0.16b, v20.16b
5354
aesmc v0.16b, v0.16b //AES block 0 - round 2
5355
ldr q12, [x3, #32] //load h1l | h1h
5356
#ifndef __AARCH64EB__
5357
ext v12.16b, v12.16b, v12.16b, #8
5358
#endif
5359
aese v2.16b, v20.16b
5360
aesmc v2.16b, v2.16b //AES block 2 - round 2
5361
ld1 {v30.4s}, [x8], #16 //load rk12
5362
5363
aese v3.16b, v20.16b
5364
aesmc v3.16b, v3.16b //AES block 3 - round 2
5365
5366
aese v0.16b, v21.16b
5367
aesmc v0.16b, v0.16b //AES block 0 - round 3
5368
5369
aese v1.16b, v20.16b
5370
aesmc v1.16b, v1.16b //AES block 1 - round 2
5371
5372
aese v3.16b, v21.16b
5373
aesmc v3.16b, v3.16b //AES block 3 - round 3
5374
5375
aese v0.16b, v22.16b
5376
aesmc v0.16b, v0.16b //AES block 0 - round 4
5377
cmp x0, x5 //check if we have <= 4 blocks
5378
5379
aese v2.16b, v21.16b
5380
aesmc v2.16b, v2.16b //AES block 2 - round 3
5381
5382
aese v1.16b, v21.16b
5383
aesmc v1.16b, v1.16b //AES block 1 - round 3
5384
5385
aese v3.16b, v22.16b
5386
aesmc v3.16b, v3.16b //AES block 3 - round 4
5387
5388
aese v2.16b, v22.16b
5389
aesmc v2.16b, v2.16b //AES block 2 - round 4
5390
5391
aese v1.16b, v22.16b
5392
aesmc v1.16b, v1.16b //AES block 1 - round 4
5393
5394
aese v3.16b, v23.16b
5395
aesmc v3.16b, v3.16b //AES block 3 - round 5
5396
5397
aese v0.16b, v23.16b
5398
aesmc v0.16b, v0.16b //AES block 0 - round 5
5399
5400
aese v1.16b, v23.16b
5401
aesmc v1.16b, v1.16b //AES block 1 - round 5
5402
5403
aese v2.16b, v23.16b
5404
aesmc v2.16b, v2.16b //AES block 2 - round 5
5405
5406
aese v0.16b, v24.16b
5407
aesmc v0.16b, v0.16b //AES block 0 - round 6
5408
5409
aese v3.16b, v24.16b
5410
aesmc v3.16b, v3.16b //AES block 3 - round 6
5411
5412
aese v1.16b, v24.16b
5413
aesmc v1.16b, v1.16b //AES block 1 - round 6
5414
5415
aese v2.16b, v24.16b
5416
aesmc v2.16b, v2.16b //AES block 2 - round 6
5417
5418
aese v0.16b, v25.16b
5419
aesmc v0.16b, v0.16b //AES block 0 - round 7
5420
5421
aese v1.16b, v25.16b
5422
aesmc v1.16b, v1.16b //AES block 1 - round 7
5423
5424
aese v3.16b, v25.16b
5425
aesmc v3.16b, v3.16b //AES block 3 - round 7
5426
5427
aese v0.16b, v26.16b
5428
aesmc v0.16b, v0.16b //AES block 0 - round 8
5429
5430
aese v2.16b, v25.16b
5431
aesmc v2.16b, v2.16b //AES block 2 - round 7
5432
5433
aese v3.16b, v26.16b
5434
aesmc v3.16b, v3.16b //AES block 3 - round 8
5435
5436
aese v1.16b, v26.16b
5437
aesmc v1.16b, v1.16b //AES block 1 - round 8
5438
5439
aese v0.16b, v27.16b
5440
aesmc v0.16b, v0.16b //AES block 0 - round 9
5441
5442
aese v2.16b, v26.16b
5443
aesmc v2.16b, v2.16b //AES block 2 - round 8
5444
ld1 {v31.4s}, [x8], #16 //load rk13
5445
5446
aese v1.16b, v27.16b
5447
aesmc v1.16b, v1.16b //AES block 1 - round 9
5448
5449
aese v0.16b, v28.16b
5450
aesmc v0.16b, v0.16b //AES block 0 - round 10
5451
5452
aese v3.16b, v27.16b
5453
aesmc v3.16b, v3.16b //AES block 3 - round 9
5454
5455
aese v1.16b, v28.16b
5456
aesmc v1.16b, v1.16b //AES block 1 - round 10
5457
5458
aese v2.16b, v27.16b
5459
aesmc v2.16b, v2.16b //AES block 2 - round 9
5460
5461
aese v3.16b, v28.16b
5462
aesmc v3.16b, v3.16b //AES block 3 - round 10
5463
5464
aese v0.16b, v29.16b
5465
aesmc v0.16b, v0.16b //AES block 0 - round 11
5466
5467
aese v2.16b, v28.16b
5468
aesmc v2.16b, v2.16b //AES block 2 - round 10
5469
5470
aese v3.16b, v29.16b
5471
aesmc v3.16b, v3.16b //AES block 3 - round 11
5472
5473
aese v1.16b, v29.16b
5474
aesmc v1.16b, v1.16b //AES block 1 - round 11
5475
5476
aese v2.16b, v29.16b
5477
aesmc v2.16b, v2.16b //AES block 2 - round 11
5478
5479
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
5480
5481
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
5482
5483
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
5484
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
5485
5486
aese v1.16b, v30.16b
5487
aesmc v1.16b, v1.16b //AES block 1 - round 12
5488
5489
aese v0.16b, v30.16b
5490
aesmc v0.16b, v0.16b //AES block 0 - round 12
5491
5492
aese v2.16b, v30.16b
5493
aesmc v2.16b, v2.16b //AES block 2 - round 12
5494
5495
aese v3.16b, v30.16b
5496
aesmc v3.16b, v3.16b //AES block 3 - round 12
5497
eor v17.16b, v17.16b, v9.16b //h4k | h3k
5498
5499
aese v1.16b, v31.16b //AES block 1 - round 13
5500
5501
aese v2.16b, v31.16b //AES block 2 - round 13
5502
eor v16.16b, v16.16b, v8.16b //h2k | h1k
5503
5504
aese v3.16b, v31.16b //AES block 3 - round 13
5505
5506
aese v0.16b, v31.16b //AES block 0 - round 13
5507
b.ge .L256_dec_tail //handle tail
5508
5509
ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
5510
5511
rev w9, w12 //CTR block 4
5512
5513
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
5514
5515
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
5516
rev64 v5.16b, v5.16b //GHASH block 1
5517
ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext
5518
5519
mov x7, v0.d[1] //AES block 0 - mov high
5520
5521
mov x6, v0.d[0] //AES block 0 - mov low
5522
rev64 v4.16b, v4.16b //GHASH block 0
5523
add w12, w12, #1 //CTR block 4
5524
5525
fmov d0, x10 //CTR block 4
5526
orr x9, x11, x9, lsl #32 //CTR block 4
5527
5528
fmov v0.d[1], x9 //CTR block 4
5529
rev w9, w12 //CTR block 5
5530
add w12, w12, #1 //CTR block 5
5531
5532
mov x19, v1.d[0] //AES block 1 - mov low
5533
5534
orr x9, x11, x9, lsl #32 //CTR block 5
5535
mov x20, v1.d[1] //AES block 1 - mov high
5536
eor x7, x7, x14 //AES block 0 - round 14 high
5537
#ifdef __AARCH64EB__
5538
rev x7, x7
5539
#endif
5540
eor x6, x6, x13 //AES block 0 - round 14 low
5541
#ifdef __AARCH64EB__
5542
rev x6, x6
5543
#endif
5544
stp x6, x7, [x2], #16 //AES block 0 - store result
5545
fmov d1, x10 //CTR block 5
5546
5547
ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext
5548
5549
fmov v1.d[1], x9 //CTR block 5
5550
rev w9, w12 //CTR block 6
5551
add w12, w12, #1 //CTR block 6
5552
5553
eor x19, x19, x13 //AES block 1 - round 14 low
5554
#ifdef __AARCH64EB__
5555
rev x19, x19
5556
#endif
5557
orr x9, x11, x9, lsl #32 //CTR block 6
5558
5559
eor x20, x20, x14 //AES block 1 - round 14 high
5560
#ifdef __AARCH64EB__
5561
rev x20, x20
5562
#endif
5563
stp x19, x20, [x2], #16 //AES block 1 - store result
5564
5565
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
5566
cmp x0, x5 //check if we have <= 8 blocks
5567
b.ge .L256_dec_prepretail //do prepretail
5568
5569
.L256_dec_main_loop: //main loop start
5570
mov x21, v2.d[0] //AES block 4k+2 - mov low
5571
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
5572
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
5573
5574
aese v0.16b, v18.16b
5575
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
5576
mov x22, v2.d[1] //AES block 4k+2 - mov high
5577
5578
aese v1.16b, v18.16b
5579
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
5580
fmov d2, x10 //CTR block 4k+6
5581
5582
fmov v2.d[1], x9 //CTR block 4k+6
5583
eor v4.16b, v4.16b, v11.16b //PRE 1
5584
rev w9, w12 //CTR block 4k+7
5585
5586
aese v0.16b, v19.16b
5587
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
5588
mov x24, v3.d[1] //AES block 4k+3 - mov high
5589
5590
aese v1.16b, v19.16b
5591
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
5592
mov x23, v3.d[0] //AES block 4k+3 - mov low
5593
5594
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
5595
mov d8, v4.d[1] //GHASH block 4k - mid
5596
fmov d3, x10 //CTR block 4k+7
5597
5598
aese v0.16b, v20.16b
5599
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
5600
orr x9, x11, x9, lsl #32 //CTR block 4k+7
5601
5602
aese v2.16b, v18.16b
5603
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
5604
fmov v3.d[1], x9 //CTR block 4k+7
5605
5606
aese v1.16b, v20.16b
5607
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
5608
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
5609
5610
aese v0.16b, v21.16b
5611
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
5612
eor x22, x22, x14 //AES block 4k+2 - round 14 high
5613
#ifdef __AARCH64EB__
5614
rev x22, x22
5615
#endif
5616
aese v2.16b, v19.16b
5617
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
5618
mov d10, v17.d[1] //GHASH block 4k - mid
5619
5620
aese v1.16b, v21.16b
5621
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
5622
rev64 v6.16b, v6.16b //GHASH block 4k+2
5623
5624
aese v3.16b, v18.16b
5625
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
5626
eor x21, x21, x13 //AES block 4k+2 - round 14 low
5627
#ifdef __AARCH64EB__
5628
rev x21, x21
5629
#endif
5630
aese v2.16b, v20.16b
5631
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
5632
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
5633
5634
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
5635
5636
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
5637
5638
aese v2.16b, v21.16b
5639
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
5640
rev64 v7.16b, v7.16b //GHASH block 4k+3
5641
5642
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
5643
eor x23, x23, x13 //AES block 4k+3 - round 14 low
5644
#ifdef __AARCH64EB__
5645
rev x23, x23
5646
#endif
5647
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
5648
eor x24, x24, x14 //AES block 4k+3 - round 14 high
5649
#ifdef __AARCH64EB__
5650
rev x24, x24
5651
#endif
5652
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
5653
5654
aese v2.16b, v22.16b
5655
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
5656
5657
aese v3.16b, v19.16b
5658
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
5659
mov d4, v5.d[1] //GHASH block 4k+1 - mid
5660
5661
aese v0.16b, v22.16b
5662
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
5663
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
5664
5665
aese v2.16b, v23.16b
5666
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
5667
add w12, w12, #1 //CTR block 4k+7
5668
5669
aese v3.16b, v20.16b
5670
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
5671
mov d8, v6.d[1] //GHASH block 4k+2 - mid
5672
5673
aese v1.16b, v22.16b
5674
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
5675
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
5676
5677
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
5678
5679
aese v3.16b, v21.16b
5680
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
5681
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
5682
5683
aese v1.16b, v23.16b
5684
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
5685
5686
aese v0.16b, v23.16b
5687
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
5688
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
5689
5690
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
5691
rev w9, w12 //CTR block 4k+8
5692
5693
aese v1.16b, v24.16b
5694
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
5695
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
5696
5697
aese v0.16b, v24.16b
5698
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
5699
add w12, w12, #1 //CTR block 4k+8
5700
5701
aese v3.16b, v22.16b
5702
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
5703
5704
aese v1.16b, v25.16b
5705
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
5706
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
5707
5708
aese v0.16b, v25.16b
5709
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
5710
5711
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
5712
mov d6, v7.d[1] //GHASH block 4k+3 - mid
5713
5714
aese v3.16b, v23.16b
5715
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
5716
5717
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
5718
5719
aese v0.16b, v26.16b
5720
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
5721
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
5722
5723
aese v3.16b, v24.16b
5724
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
5725
5726
pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
5727
orr x9, x11, x9, lsl #32 //CTR block 4k+8
5728
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
5729
5730
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
5731
5732
aese v0.16b, v27.16b
5733
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
5734
eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid
5735
5736
aese v1.16b, v26.16b
5737
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
5738
5739
aese v2.16b, v24.16b
5740
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
5741
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
5742
5743
aese v0.16b, v28.16b
5744
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
5745
5746
pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid
5747
movi v8.8b, #0xc2
5748
5749
aese v2.16b, v25.16b
5750
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
5751
eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low
5752
5753
aese v0.16b, v29.16b
5754
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
5755
5756
aese v3.16b, v25.16b
5757
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
5758
shl d8, d8, #56 //mod_constant
5759
5760
aese v2.16b, v26.16b
5761
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
5762
eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid
5763
5764
aese v0.16b, v30.16b
5765
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
5766
5767
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
5768
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
5769
5770
aese v1.16b, v27.16b
5771
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
5772
ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
5773
5774
aese v0.16b, v31.16b //AES block 4k+4 - round 13
5775
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
5776
5777
aese v1.16b, v28.16b
5778
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
5779
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
5780
5781
aese v2.16b, v27.16b
5782
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
5783
ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
5784
5785
aese v3.16b, v26.16b
5786
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
5787
eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
5788
5789
aese v1.16b, v29.16b
5790
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
5791
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
5792
5793
aese v2.16b, v28.16b
5794
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
5795
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
5796
5797
aese v3.16b, v27.16b
5798
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
5799
ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
5800
5801
aese v1.16b, v30.16b
5802
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
5803
ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
5804
5805
aese v2.16b, v29.16b
5806
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
5807
mov x7, v0.d[1] //AES block 4k+4 - mov high
5808
5809
aese v3.16b, v28.16b
5810
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
5811
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
5812
5813
aese v1.16b, v31.16b //AES block 4k+5 - round 13
5814
mov x6, v0.d[0] //AES block 4k+4 - mov low
5815
5816
aese v2.16b, v30.16b
5817
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
5818
fmov d0, x10 //CTR block 4k+8
5819
5820
aese v3.16b, v29.16b
5821
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
5822
fmov v0.d[1], x9 //CTR block 4k+8
5823
5824
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
5825
eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
5826
rev w9, w12 //CTR block 4k+9
5827
5828
aese v2.16b, v31.16b //AES block 4k+6 - round 13
5829
orr x9, x11, x9, lsl #32 //CTR block 4k+9
5830
cmp x0, x5 //.LOOP CONTROL
5831
5832
add w12, w12, #1 //CTR block 4k+9
5833
5834
eor x6, x6, x13 //AES block 4k+4 - round 14 low
5835
#ifdef __AARCH64EB__
5836
rev x6, x6
5837
#endif
5838
eor x7, x7, x14 //AES block 4k+4 - round 14 high
5839
#ifdef __AARCH64EB__
5840
rev x7, x7
5841
#endif
5842
mov x20, v1.d[1] //AES block 4k+5 - mov high
5843
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
5844
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
5845
5846
aese v3.16b, v30.16b
5847
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
5848
mov x19, v1.d[0] //AES block 4k+5 - mov low
5849
5850
fmov d1, x10 //CTR block 4k+9
5851
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
5852
5853
fmov v1.d[1], x9 //CTR block 4k+9
5854
rev w9, w12 //CTR block 4k+10
5855
add w12, w12, #1 //CTR block 4k+10
5856
5857
aese v3.16b, v31.16b //AES block 4k+7 - round 13
5858
orr x9, x11, x9, lsl #32 //CTR block 4k+10
5859
5860
rev64 v5.16b, v5.16b //GHASH block 4k+5
5861
eor x20, x20, x14 //AES block 4k+5 - round 14 high
5862
#ifdef __AARCH64EB__
5863
rev x20, x20
5864
#endif
5865
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
5866
5867
eor x19, x19, x13 //AES block 4k+5 - round 14 low
5868
#ifdef __AARCH64EB__
5869
rev x19, x19
5870
#endif
5871
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
5872
5873
rev64 v4.16b, v4.16b //GHASH block 4k+4
5874
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
5875
b.lt .L256_dec_main_loop
5876
5877
5878
.L256_dec_prepretail: //PREPRETAIL
5879
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
5880
mov x21, v2.d[0] //AES block 4k+2 - mov low
5881
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
5882
5883
aese v0.16b, v18.16b
5884
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
5885
mov x22, v2.d[1] //AES block 4k+2 - mov high
5886
5887
aese v1.16b, v18.16b
5888
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
5889
fmov d2, x10 //CTR block 4k+6
5890
5891
fmov v2.d[1], x9 //CTR block 4k+6
5892
rev w9, w12 //CTR block 4k+7
5893
eor v4.16b, v4.16b, v11.16b //PRE 1
5894
5895
rev64 v6.16b, v6.16b //GHASH block 4k+2
5896
orr x9, x11, x9, lsl #32 //CTR block 4k+7
5897
mov x23, v3.d[0] //AES block 4k+3 - mov low
5898
5899
aese v1.16b, v19.16b
5900
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
5901
mov x24, v3.d[1] //AES block 4k+3 - mov high
5902
5903
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
5904
mov d8, v4.d[1] //GHASH block 4k - mid
5905
fmov d3, x10 //CTR block 4k+7
5906
5907
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
5908
fmov v3.d[1], x9 //CTR block 4k+7
5909
5910
aese v2.16b, v18.16b
5911
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
5912
mov d10, v17.d[1] //GHASH block 4k - mid
5913
5914
aese v0.16b, v19.16b
5915
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
5916
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
5917
5918
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
5919
5920
aese v2.16b, v19.16b
5921
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
5922
rev64 v7.16b, v7.16b //GHASH block 4k+3
5923
5924
aese v3.16b, v18.16b
5925
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
5926
5927
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
5928
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
5929
5930
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
5931
5932
aese v3.16b, v19.16b
5933
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
5934
mov d4, v5.d[1] //GHASH block 4k+1 - mid
5935
5936
aese v0.16b, v20.16b
5937
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
5938
5939
aese v1.16b, v20.16b
5940
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
5941
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
5942
5943
aese v2.16b, v20.16b
5944
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
5945
5946
aese v0.16b, v21.16b
5947
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
5948
mov d8, v6.d[1] //GHASH block 4k+2 - mid
5949
5950
aese v3.16b, v20.16b
5951
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
5952
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
5953
5954
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
5955
5956
aese v0.16b, v22.16b
5957
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
5958
5959
aese v3.16b, v21.16b
5960
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
5961
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
5962
5963
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
5964
5965
aese v0.16b, v23.16b
5966
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
5967
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
5968
5969
aese v3.16b, v22.16b
5970
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
5971
5972
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
5973
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
5974
5975
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
5976
5977
aese v3.16b, v23.16b
5978
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
5979
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
5980
5981
aese v2.16b, v21.16b
5982
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
5983
5984
aese v1.16b, v21.16b
5985
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
5986
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
5987
5988
pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
5989
5990
aese v2.16b, v22.16b
5991
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
5992
mov d6, v7.d[1] //GHASH block 4k+3 - mid
5993
5994
aese v1.16b, v22.16b
5995
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
5996
5997
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
5998
5999
aese v2.16b, v23.16b
6000
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
6001
eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid
6002
6003
aese v1.16b, v23.16b
6004
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
6005
6006
aese v3.16b, v24.16b
6007
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
6008
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
6009
6010
aese v2.16b, v24.16b
6011
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
6012
6013
aese v0.16b, v24.16b
6014
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
6015
movi v8.8b, #0xc2
6016
6017
aese v1.16b, v24.16b
6018
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
6019
eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low
6020
6021
pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid
6022
6023
aese v3.16b, v25.16b
6024
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
6025
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
6026
6027
aese v1.16b, v25.16b
6028
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
6029
6030
aese v0.16b, v25.16b
6031
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
6032
eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid
6033
6034
aese v3.16b, v26.16b
6035
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
6036
6037
aese v2.16b, v25.16b
6038
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
6039
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
6040
6041
aese v1.16b, v26.16b
6042
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
6043
6044
aese v0.16b, v26.16b
6045
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
6046
shl d8, d8, #56 //mod_constant
6047
6048
aese v2.16b, v26.16b
6049
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
6050
6051
aese v1.16b, v27.16b
6052
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
6053
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
6054
6055
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
6056
6057
aese v2.16b, v27.16b
6058
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
6059
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
6060
6061
aese v3.16b, v27.16b
6062
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
6063
6064
aese v0.16b, v27.16b
6065
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
6066
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
6067
6068
aese v2.16b, v28.16b
6069
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
6070
6071
aese v3.16b, v28.16b
6072
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
6073
6074
aese v0.16b, v28.16b
6075
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
6076
eor x22, x22, x14 //AES block 4k+2 - round 14 high
6077
#ifdef __AARCH64EB__
6078
rev x22, x22
6079
#endif
6080
aese v1.16b, v28.16b
6081
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
6082
eor x23, x23, x13 //AES block 4k+3 - round 14 low
6083
#ifdef __AARCH64EB__
6084
rev x23, x23
6085
#endif
6086
aese v2.16b, v29.16b
6087
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
6088
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
6089
6090
aese v0.16b, v29.16b
6091
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
6092
add w12, w12, #1 //CTR block 4k+7
6093
6094
aese v1.16b, v29.16b
6095
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
6096
eor x21, x21, x13 //AES block 4k+2 - round 14 low
6097
#ifdef __AARCH64EB__
6098
rev x21, x21
6099
#endif
6100
6101
aese v2.16b, v30.16b
6102
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
6103
6104
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
6105
eor x24, x24, x14 //AES block 4k+3 - round 14 high
6106
#ifdef __AARCH64EB__
6107
rev x24, x24
6108
#endif
6109
6110
aese v3.16b, v29.16b
6111
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
6112
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
6113
6114
aese v1.16b, v30.16b
6115
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
6116
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
6117
6118
aese v0.16b, v30.16b
6119
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
6120
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
6121
6122
aese v3.16b, v30.16b
6123
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
6124
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
6125
6126
aese v1.16b, v31.16b //AES block 4k+5 - round 13
6127
6128
aese v0.16b, v31.16b //AES block 4k+4 - round 13
6129
6130
aese v3.16b, v31.16b //AES block 4k+7 - round 13
6131
6132
aese v2.16b, v31.16b //AES block 4k+6 - round 13
6133
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
6134
.L256_dec_tail: //TAIL
6135
6136
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
6137
ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
6138
6139
eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result
6140
6141
mov x6, v0.d[0] //AES block 4k+4 - mov low
6142
6143
mov x7, v0.d[1] //AES block 4k+4 - mov high
6144
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
6145
6146
cmp x5, #48
6147
6148
eor x6, x6, x13 //AES block 4k+4 - round 14 low
6149
#ifdef __AARCH64EB__
6150
rev x6, x6
6151
#endif
6152
6153
eor x7, x7, x14 //AES block 4k+4 - round 14 high
6154
#ifdef __AARCH64EB__
6155
rev x7, x7
6156
#endif
6157
b.gt .L256_dec_blocks_more_than_3
6158
6159
sub w12, w12, #1
6160
mov v3.16b, v2.16b
6161
movi v10.8b, #0
6162
6163
movi v11.8b, #0
6164
cmp x5, #32
6165
6166
movi v9.8b, #0
6167
mov v2.16b, v1.16b
6168
b.gt .L256_dec_blocks_more_than_2
6169
6170
sub w12, w12, #1
6171
6172
mov v3.16b, v1.16b
6173
cmp x5, #16
6174
b.gt .L256_dec_blocks_more_than_1
6175
6176
sub w12, w12, #1
6177
b .L256_dec_blocks_less_than_1
6178
.L256_dec_blocks_more_than_3: //blocks left > 3
6179
rev64 v4.16b, v5.16b //GHASH final-3 block
6180
ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext
6181
6182
stp x6, x7, [x2], #16 //AES final-3 block - store result
6183
6184
mov d10, v17.d[1] //GHASH final-3 block - mid
6185
6186
eor v4.16b, v4.16b, v8.16b //feed in partial tag
6187
6188
eor v0.16b, v5.16b, v1.16b //AES final-2 block - result
6189
6190
mov d22, v4.d[1] //GHASH final-3 block - mid
6191
6192
mov x6, v0.d[0] //AES final-2 block - mov low
6193
6194
mov x7, v0.d[1] //AES final-2 block - mov high
6195
6196
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
6197
6198
movi v8.8b, #0 //suppress further partial tag feed in
6199
6200
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
6201
6202
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
6203
eor x6, x6, x13 //AES final-2 block - round 14 low
6204
#ifdef __AARCH64EB__
6205
rev x6, x6
6206
#endif
6207
6208
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
6209
eor x7, x7, x14 //AES final-2 block - round 14 high
6210
#ifdef __AARCH64EB__
6211
rev x7, x7
6212
#endif
6213
.L256_dec_blocks_more_than_2: //blocks left > 2
6214
6215
rev64 v4.16b, v5.16b //GHASH final-2 block
6216
ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext
6217
6218
eor v4.16b, v4.16b, v8.16b //feed in partial tag
6219
stp x6, x7, [x2], #16 //AES final-2 block - store result
6220
6221
eor v0.16b, v5.16b, v2.16b //AES final-1 block - result
6222
6223
mov d22, v4.d[1] //GHASH final-2 block - mid
6224
6225
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
6226
6227
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
6228
6229
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
6230
mov x6, v0.d[0] //AES final-1 block - mov low
6231
6232
mov x7, v0.d[1] //AES final-1 block - mov high
6233
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
6234
movi v8.8b, #0 //suppress further partial tag feed in
6235
6236
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
6237
6238
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
6239
eor x6, x6, x13 //AES final-1 block - round 14 low
6240
#ifdef __AARCH64EB__
6241
rev x6, x6
6242
#endif
6243
6244
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
6245
eor x7, x7, x14 //AES final-1 block - round 14 high
6246
#ifdef __AARCH64EB__
6247
rev x7, x7
6248
#endif
6249
.L256_dec_blocks_more_than_1: //blocks left > 1
6250
6251
stp x6, x7, [x2], #16 //AES final-1 block - store result
6252
rev64 v4.16b, v5.16b //GHASH final-1 block
6253
6254
ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext
6255
6256
eor v4.16b, v4.16b, v8.16b //feed in partial tag
6257
movi v8.8b, #0 //suppress further partial tag feed in
6258
6259
mov d22, v4.d[1] //GHASH final-1 block - mid
6260
6261
eor v0.16b, v5.16b, v3.16b //AES final block - result
6262
6263
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
6264
6265
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
6266
6267
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
6268
mov x6, v0.d[0] //AES final block - mov low
6269
6270
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
6271
6272
mov x7, v0.d[1] //AES final block - mov high
6273
6274
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
6275
eor x6, x6, x13 //AES final block - round 14 low
6276
#ifdef __AARCH64EB__
6277
rev x6, x6
6278
#endif
6279
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
6280
6281
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
6282
6283
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
6284
eor x7, x7, x14 //AES final block - round 14 high
6285
#ifdef __AARCH64EB__
6286
rev x7, x7
6287
#endif
6288
.L256_dec_blocks_less_than_1: //blocks left <= 1
6289
6290
and x1, x1, #127 //bit_length %= 128
6291
mvn x14, xzr //rk14_h = 0xffffffffffffffff
6292
6293
sub x1, x1, #128 //bit_length -= 128
6294
mvn x13, xzr //rk14_l = 0xffffffffffffffff
6295
6296
ldp x4, x5, [x2] //load existing bytes we need to not overwrite
6297
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
6298
6299
and x1, x1, #127 //bit_length %= 128
6300
6301
lsr x14, x14, x1 //rk14_h is mask for top 64b of last block
6302
cmp x1, #64
6303
6304
csel x9, x13, x14, lt
6305
csel x10, x14, xzr, lt
6306
6307
fmov d0, x9 //ctr0b is mask for last block
6308
and x6, x6, x9
6309
6310
mov v0.d[1], x10
6311
bic x4, x4, x9 //mask out low existing bytes
6312
6313
#ifndef __AARCH64EB__
6314
rev w9, w12
6315
#else
6316
mov w9, w12
6317
#endif
6318
6319
bic x5, x5, x10 //mask out high existing bytes
6320
6321
orr x6, x6, x4
6322
6323
and x7, x7, x10
6324
6325
orr x7, x7, x5
6326
6327
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
6328
6329
rev64 v4.16b, v5.16b //GHASH final block
6330
6331
eor v4.16b, v4.16b, v8.16b //feed in partial tag
6332
6333
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
6334
6335
mov d8, v4.d[1] //GHASH final block - mid
6336
6337
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
6338
6339
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
6340
6341
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
6342
6343
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
6344
6345
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
6346
6347
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
6348
movi v8.8b, #0xc2
6349
6350
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
6351
6352
shl d8, d8, #56 //mod_constant
6353
6354
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
6355
6356
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
6357
6358
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
6359
6360
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
6361
6362
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
6363
6364
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
6365
6366
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
6367
6368
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
6369
6370
stp x6, x7, [x2]
6371
6372
str w9, [x16, #12] //store the updated counter
6373
6374
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
6375
ext v11.16b, v11.16b, v11.16b, #8
6376
rev64 v11.16b, v11.16b
6377
mov x0, x15
6378
st1 { v11.16b }, [x3]
6379
6380
ldp x21, x22, [sp, #16]
6381
ldp x23, x24, [sp, #32]
6382
ldp d8, d9, [sp, #48]
6383
ldp d10, d11, [sp, #64]
6384
ldp d12, d13, [sp, #80]
6385
ldp d14, d15, [sp, #96]
6386
ldp x19, x20, [sp], #112
6387
ret
6388
6389
.L256_dec_ret:
6390
mov w0, #0x0
6391
ret
6392
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
6393
// Trailing read-only data emitted after aes_gcm_dec_256_kernel.
.section .rodata
6394
// The byte list below is the NUL-terminated ASCII text
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
// No label is attached to it in the visible code.
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
6395
// Pad after the 51-byte string. On AArch64 GNU as, `.align n` takes a
// power-of-two exponent, so this requests 2^2 = 4-byte alignment.
.align 2
6396
// Same directive repeated; the location is already aligned by the previous
// line, so this one adds no padding.
.align 2
6397
#endif
6398
6399