Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/crypto/openssl/aarch64/aes-gcm-armv8-unroll8_64.S
39507 views
1
/* Do not modify. This file is auto-generated from aes-gcm-armv8-unroll8_64.pl. */
2
#include "arm_arch.h"
3
4
#if __ARM_MAX_ARCH__>=8
5
.arch armv8-a+crypto
6
.text
7
.globl unroll8_eor3_aes_gcm_enc_128_kernel
8
.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
9
.align 4
10
unroll8_eor3_aes_gcm_enc_128_kernel:
11
AARCH64_VALID_CALL_TARGET
12
cbz x1, .L128_enc_ret
13
stp d8, d9, [sp, #-80]!
14
lsr x9, x1, #3
15
mov x16, x4
16
mov x8, x5
17
stp d10, d11, [sp, #16]
18
stp d12, d13, [sp, #32]
19
stp d14, d15, [sp, #48]
20
mov x5, #0xc200000000000000
21
stp x5, xzr, [sp, #64]
22
add x10, sp, #64
23
24
mov x15, #0x100000000 //set up counter increment
25
movi v31.16b, #0x0
26
mov v31.d[1], x15
27
mov x5, x9
28
ld1 { v0.16b}, [x16] //CTR block 0
29
30
sub x5, x5, #1 //byte_len - 1
31
32
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
33
34
rev32 v30.16b, v0.16b //set up reversed counter
35
36
add v30.4s, v30.4s, v31.4s //CTR block 0
37
38
rev32 v1.16b, v30.16b //CTR block 1
39
add v30.4s, v30.4s, v31.4s //CTR block 1
40
41
rev32 v2.16b, v30.16b //CTR block 2
42
add v30.4s, v30.4s, v31.4s //CTR block 2
43
44
rev32 v3.16b, v30.16b //CTR block 3
45
add v30.4s, v30.4s, v31.4s //CTR block 3
46
47
rev32 v4.16b, v30.16b //CTR block 4
48
add v30.4s, v30.4s, v31.4s //CTR block 4
49
50
rev32 v5.16b, v30.16b //CTR block 5
51
add v30.4s, v30.4s, v31.4s //CTR block 5
52
ldp q26, q27, [x8, #0] //load rk0, rk1
53
54
rev32 v6.16b, v30.16b //CTR block 6
55
add v30.4s, v30.4s, v31.4s //CTR block 6
56
57
rev32 v7.16b, v30.16b //CTR block 7
58
add v30.4s, v30.4s, v31.4s //CTR block 7
59
60
aese v4.16b, v26.16b
61
aesmc v4.16b, v4.16b //AES block 4 - round 0
62
aese v6.16b, v26.16b
63
aesmc v6.16b, v6.16b //AES block 6 - round 0
64
aese v3.16b, v26.16b
65
aesmc v3.16b, v3.16b //AES block 3 - round 0
66
67
aese v0.16b, v26.16b
68
aesmc v0.16b, v0.16b //AES block 0 - round 0
69
aese v1.16b, v26.16b
70
aesmc v1.16b, v1.16b //AES block 1 - round 0
71
aese v2.16b, v26.16b
72
aesmc v2.16b, v2.16b //AES block 2 - round 0
73
74
aese v7.16b, v26.16b
75
aesmc v7.16b, v7.16b //AES block 7 - round 0
76
aese v5.16b, v26.16b
77
aesmc v5.16b, v5.16b //AES block 5 - round 0
78
ldp q28, q26, [x8, #32] //load rk2, rk3
79
80
aese v3.16b, v27.16b
81
aesmc v3.16b, v3.16b //AES block 3 - round 1
82
83
aese v7.16b, v27.16b
84
aesmc v7.16b, v7.16b //AES block 7 - round 1
85
aese v5.16b, v27.16b
86
aesmc v5.16b, v5.16b //AES block 5 - round 1
87
aese v4.16b, v27.16b
88
aesmc v4.16b, v4.16b //AES block 4 - round 1
89
90
aese v2.16b, v27.16b
91
aesmc v2.16b, v2.16b //AES block 2 - round 1
92
aese v6.16b, v27.16b
93
aesmc v6.16b, v6.16b //AES block 6 - round 1
94
aese v0.16b, v27.16b
95
aesmc v0.16b, v0.16b //AES block 0 - round 1
96
97
aese v5.16b, v28.16b
98
aesmc v5.16b, v5.16b //AES block 5 - round 2
99
aese v1.16b, v27.16b
100
aesmc v1.16b, v1.16b //AES block 1 - round 1
101
aese v0.16b, v28.16b
102
aesmc v0.16b, v0.16b //AES block 0 - round 2
103
104
aese v2.16b, v28.16b
105
aesmc v2.16b, v2.16b //AES block 2 - round 2
106
aese v3.16b, v28.16b
107
aesmc v3.16b, v3.16b //AES block 3 - round 2
108
aese v7.16b, v28.16b
109
aesmc v7.16b, v7.16b //AES block 7 - round 2
110
111
aese v1.16b, v28.16b
112
aesmc v1.16b, v1.16b //AES block 1 - round 2
113
aese v6.16b, v28.16b
114
aesmc v6.16b, v6.16b //AES block 6 - round 2
115
aese v4.16b, v28.16b
116
aesmc v4.16b, v4.16b //AES block 4 - round 2
117
118
aese v2.16b, v26.16b
119
aesmc v2.16b, v2.16b //AES block 2 - round 3
120
121
ldp q27, q28, [x8, #64] //load rk4, rk5
122
aese v5.16b, v26.16b
123
aesmc v5.16b, v5.16b //AES block 5 - round 3
124
aese v0.16b, v26.16b
125
aesmc v0.16b, v0.16b //AES block 0 - round 3
126
127
aese v4.16b, v26.16b
128
aesmc v4.16b, v4.16b //AES block 4 - round 3
129
aese v3.16b, v26.16b
130
aesmc v3.16b, v3.16b //AES block 3 - round 3
131
aese v6.16b, v26.16b
132
aesmc v6.16b, v6.16b //AES block 6 - round 3
133
134
aese v7.16b, v26.16b
135
aesmc v7.16b, v7.16b //AES block 7 - round 3
136
137
aese v6.16b, v27.16b
138
aesmc v6.16b, v6.16b //AES block 6 - round 4
139
aese v1.16b, v26.16b
140
aesmc v1.16b, v1.16b //AES block 1 - round 3
141
aese v5.16b, v27.16b
142
aesmc v5.16b, v5.16b //AES block 5 - round 4
143
144
aese v7.16b, v27.16b
145
aesmc v7.16b, v7.16b //AES block 7 - round 4
146
aese v4.16b, v27.16b
147
aesmc v4.16b, v4.16b //AES block 4 - round 4
148
aese v0.16b, v27.16b
149
aesmc v0.16b, v0.16b //AES block 0 - round 4
150
151
aese v1.16b, v27.16b
152
aesmc v1.16b, v1.16b //AES block 1 - round 4
153
aese v2.16b, v27.16b
154
aesmc v2.16b, v2.16b //AES block 2 - round 4
155
aese v3.16b, v27.16b
156
aesmc v3.16b, v3.16b //AES block 3 - round 4
157
158
aese v7.16b, v28.16b
159
aesmc v7.16b, v7.16b //AES block 7 - round 5
160
aese v0.16b, v28.16b
161
aesmc v0.16b, v0.16b //AES block 0 - round 5
162
ldp q26, q27, [x8, #96] //load rk6, rk7
163
164
aese v1.16b, v28.16b
165
aesmc v1.16b, v1.16b //AES block 1 - round 5
166
aese v3.16b, v28.16b
167
aesmc v3.16b, v3.16b //AES block 3 - round 5
168
aese v2.16b, v28.16b
169
aesmc v2.16b, v2.16b //AES block 2 - round 5
170
171
aese v4.16b, v28.16b
172
aesmc v4.16b, v4.16b //AES block 4 - round 5
173
aese v5.16b, v28.16b
174
aesmc v5.16b, v5.16b //AES block 5 - round 5
175
aese v6.16b, v28.16b
176
aesmc v6.16b, v6.16b //AES block 6 - round 5
177
178
aese v4.16b, v26.16b
179
aesmc v4.16b, v4.16b //AES block 4 - round 6
180
aese v3.16b, v26.16b
181
aesmc v3.16b, v3.16b //AES block 3 - round 6
182
aese v2.16b, v26.16b
183
aesmc v2.16b, v2.16b //AES block 2 - round 6
184
185
aese v7.16b, v26.16b
186
aesmc v7.16b, v7.16b //AES block 7 - round 6
187
aese v6.16b, v26.16b
188
aesmc v6.16b, v6.16b //AES block 6 - round 6
189
aese v5.16b, v26.16b
190
aesmc v5.16b, v5.16b //AES block 5 - round 6
191
192
aese v0.16b, v26.16b
193
aesmc v0.16b, v0.16b //AES block 0 - round 6
194
aese v1.16b, v26.16b
195
aesmc v1.16b, v1.16b //AES block 1 - round 6
196
ldp q28, q26, [x8, #128] //load rk8, rk9
197
198
aese v5.16b, v27.16b
199
aesmc v5.16b, v5.16b //AES block 5 - round 7
200
201
ld1 { v19.16b}, [x3]
202
ext v19.16b, v19.16b, v19.16b, #8
203
rev64 v19.16b, v19.16b
204
205
aese v7.16b, v27.16b
206
aesmc v7.16b, v7.16b //AES block 7 - round 7
207
208
aese v4.16b, v27.16b
209
aesmc v4.16b, v4.16b //AES block 4 - round 7
210
aese v3.16b, v27.16b
211
aesmc v3.16b, v3.16b //AES block 3 - round 7
212
aese v6.16b, v27.16b
213
aesmc v6.16b, v6.16b //AES block 6 - round 7
214
215
aese v1.16b, v27.16b
216
aesmc v1.16b, v1.16b //AES block 1 - round 7
217
aese v2.16b, v27.16b
218
aesmc v2.16b, v2.16b //AES block 2 - round 7
219
aese v0.16b, v27.16b
220
aesmc v0.16b, v0.16b //AES block 0 - round 7
221
222
aese v3.16b, v28.16b
223
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
224
aese v6.16b, v28.16b
225
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
226
aese v2.16b, v28.16b
227
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
228
229
aese v7.16b, v28.16b
230
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
231
aese v0.16b, v28.16b
232
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
233
ldr q27, [x8, #160] //load rk10
234
235
aese v3.16b, v26.16b //AES block 8k+11 - round 9
236
aese v4.16b, v28.16b
237
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
238
aese v2.16b, v26.16b //AES block 8k+10 - round 9
239
240
aese v5.16b, v28.16b
241
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
242
aese v1.16b, v28.16b
243
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
244
aese v6.16b, v26.16b //AES block 8k+14 - round 9
245
246
aese v4.16b, v26.16b //AES block 8k+12 - round 9
247
add x5, x5, x0
248
aese v0.16b, v26.16b //AES block 8k+8 - round 9
249
250
aese v7.16b, v26.16b //AES block 8k+15 - round 9
251
aese v5.16b, v26.16b //AES block 8k+13 - round 9
252
aese v1.16b, v26.16b //AES block 8k+9 - round 9
253
254
add x4, x0, x1, lsr #3 //end_input_ptr
255
cmp x0, x5 //check if we have <= 8 blocks
256
b.ge .L128_enc_tail //handle tail
257
258
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
259
260
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
261
262
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
263
264
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
265
cmp x0, x5 //check if we have <= 8 blocks
266
267
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
268
rev32 v0.16b, v30.16b //CTR block 8
269
add v30.4s, v30.4s, v31.4s //CTR block 8
270
271
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
272
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
273
274
rev32 v1.16b, v30.16b //CTR block 9
275
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
276
add v30.4s, v30.4s, v31.4s //CTR block 9
277
278
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
279
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
280
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
281
282
rev32 v2.16b, v30.16b //CTR block 10
283
add v30.4s, v30.4s, v31.4s //CTR block 10
284
285
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
286
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result
287
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
288
289
rev32 v3.16b, v30.16b //CTR block 11
290
add v30.4s, v30.4s, v31.4s //CTR block 11
291
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
292
293
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
294
295
rev32 v4.16b, v30.16b //CTR block 12
296
add v30.4s, v30.4s, v31.4s //CTR block 12
297
b.ge .L128_enc_prepretail //do prepretail
298
299
.L128_enc_main_loop: //main loop start
300
rev32 v5.16b, v30.16b //CTR block 8k+13
301
ldr q20, [x3, #128] //load h5l | h5h
302
ext v20.16b, v20.16b, v20.16b, #8
303
ldr q22, [x3, #160] //load h6l | h6h
304
ext v22.16b, v22.16b, v22.16b, #8
305
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
306
307
rev64 v9.16b, v9.16b //GHASH block 8k+1
308
rev64 v8.16b, v8.16b //GHASH block 8k
309
ldr q23, [x3, #176] //load h7l | h7h
310
ext v23.16b, v23.16b, v23.16b, #8
311
ldr q25, [x3, #208] //load h8l | h8h
312
ext v25.16b, v25.16b, v25.16b, #8
313
314
rev32 v6.16b, v30.16b //CTR block 8k+14
315
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
316
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
317
318
ldr q21, [x3, #144] //load h6k | h5k
319
ldr q24, [x3, #192] //load h8k | h7k
320
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
321
rev64 v11.16b, v11.16b //GHASH block 8k+3
322
323
ldp q26, q27, [x8, #0] //load rk0, rk1
324
eor v8.16b, v8.16b, v19.16b //PRE 1
325
rev32 v7.16b, v30.16b //CTR block 8k+15
326
327
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
328
329
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
330
rev64 v10.16b, v10.16b //GHASH block 8k+2
331
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
332
333
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
334
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
335
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
336
337
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
338
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
339
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
340
341
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
342
ldr q23, [x3, #80] //load h3l | h3h
343
ext v23.16b, v23.16b, v23.16b, #8
344
ldr q25, [x3, #112] //load h4l | h4h
345
ext v25.16b, v25.16b, v25.16b, #8
346
aese v5.16b, v26.16b
347
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
348
349
aese v1.16b, v26.16b
350
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
351
aese v4.16b, v26.16b
352
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
353
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
354
355
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
356
aese v2.16b, v26.16b
357
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
358
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
359
360
aese v6.16b, v26.16b
361
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
362
aese v1.16b, v27.16b
363
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
364
aese v0.16b, v26.16b
365
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
366
367
aese v2.16b, v27.16b
368
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
369
aese v3.16b, v26.16b
370
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
371
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
372
373
aese v5.16b, v27.16b
374
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
375
aese v7.16b, v26.16b
376
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
377
aese v0.16b, v27.16b
378
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
379
380
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high
381
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
382
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
383
384
ldp q28, q26, [x8, #32] //load rk2, rk3
385
aese v4.16b, v27.16b
386
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
387
aese v3.16b, v27.16b
388
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
389
390
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
391
aese v7.16b, v27.16b
392
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
393
aese v6.16b, v27.16b
394
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
395
396
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
397
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
398
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
399
400
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
401
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
402
403
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
404
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
405
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
406
407
aese v5.16b, v28.16b
408
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
409
aese v4.16b, v28.16b
410
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
411
aese v2.16b, v28.16b
412
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
413
414
aese v1.16b, v28.16b
415
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
416
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
417
aese v6.16b, v28.16b
418
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
419
420
aese v0.16b, v28.16b
421
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
422
aese v3.16b, v28.16b
423
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
424
aese v7.16b, v28.16b
425
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
426
427
aese v6.16b, v26.16b
428
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
429
ldr q21, [x3, #48] //load h2k | h1k
430
ldr q24, [x3, #96] //load h4k | h3k
431
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
432
433
ldp q27, q28, [x8, #64] //load rk4, rk5
434
aese v2.16b, v26.16b
435
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
436
aese v1.16b, v26.16b
437
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
438
439
ldr q20, [x3, #32] //load h1l | h1h
440
ext v20.16b, v20.16b, v20.16b, #8
441
ldr q22, [x3, #64] //load h2l | h2h
442
ext v22.16b, v22.16b, v22.16b, #8
443
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
444
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
445
446
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
447
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
448
449
aese v0.16b, v26.16b
450
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
451
aese v3.16b, v26.16b
452
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
453
454
aese v7.16b, v26.16b
455
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
456
aese v4.16b, v26.16b
457
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
458
459
aese v5.16b, v26.16b
460
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
461
aese v0.16b, v27.16b
462
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
463
464
aese v7.16b, v27.16b
465
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
466
aese v3.16b, v27.16b
467
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
468
aese v4.16b, v27.16b
469
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
470
471
aese v5.16b, v27.16b
472
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
473
aese v6.16b, v27.16b
474
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
475
aese v1.16b, v27.16b
476
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
477
478
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
479
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
480
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
481
482
aese v2.16b, v27.16b
483
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
484
ldp q26, q27, [x8, #96] //load rk6, rk7
485
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
486
487
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
488
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
489
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
490
491
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
492
aese v2.16b, v28.16b
493
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
494
aese v5.16b, v28.16b
495
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
496
497
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
498
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
499
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
500
501
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
502
aese v6.16b, v28.16b
503
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
504
505
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
506
aese v7.16b, v28.16b
507
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
508
aese v1.16b, v28.16b
509
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
510
511
aese v3.16b, v28.16b
512
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
513
aese v4.16b, v28.16b
514
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
515
aese v0.16b, v28.16b
516
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
517
518
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
519
ldr d16, [x10] //MODULO - load modulo constant
520
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
521
522
aese v7.16b, v26.16b
523
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
524
aese v5.16b, v26.16b
525
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
526
527
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
528
aese v1.16b, v26.16b
529
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
530
aese v2.16b, v26.16b
531
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
532
533
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
534
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
535
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
536
537
aese v3.16b, v26.16b
538
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
539
rev32 v20.16b, v30.16b //CTR block 8k+16
540
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
541
542
aese v4.16b, v26.16b
543
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
544
aese v0.16b, v26.16b
545
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
546
aese v6.16b, v26.16b
547
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
548
549
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
550
ldp q28, q26, [x8, #128] //load rk8, rk9
551
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
552
553
aese v2.16b, v27.16b
554
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
555
aese v7.16b, v27.16b
556
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
557
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
558
559
aese v5.16b, v27.16b
560
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
561
aese v6.16b, v27.16b
562
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
563
aese v1.16b, v27.16b
564
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
565
566
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
567
aese v0.16b, v27.16b
568
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
569
aese v4.16b, v27.16b
570
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
571
572
rev32 v22.16b, v30.16b //CTR block 8k+17
573
aese v3.16b, v27.16b
574
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
575
576
aese v5.16b, v28.16b
577
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
578
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
579
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
580
581
aese v2.16b, v28.16b
582
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
583
aese v1.16b, v28.16b
584
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
585
aese v7.16b, v28.16b
586
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
587
588
aese v4.16b, v28.16b
589
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
590
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
591
ldr q27, [x8, #160] //load rk10
592
593
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
594
rev32 v23.16b, v30.16b //CTR block 8k+18
595
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
596
aese v3.16b, v28.16b
597
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
598
599
aese v0.16b, v28.16b
600
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
601
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
602
aese v6.16b, v28.16b
603
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
604
605
aese v2.16b, v26.16b //AES block 8k+10 - round 9
606
aese v4.16b, v26.16b //AES block 8k+12 - round 9
607
aese v1.16b, v26.16b //AES block 8k+9 - round 9
608
609
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
610
rev32 v25.16b, v30.16b //CTR block 8k+19
611
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
612
613
cmp x0, x5 //.LOOP CONTROL
614
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
615
aese v7.16b, v26.16b //AES block 8k+15 - round 9
616
617
aese v6.16b, v26.16b //AES block 8k+14 - round 9
618
aese v3.16b, v26.16b //AES block 8k+11 - round 9
619
620
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
621
622
mov v2.16b, v23.16b //CTR block 8k+18
623
aese v0.16b, v26.16b //AES block 8k+8 - round 9
624
625
rev32 v4.16b, v30.16b //CTR block 8k+20
626
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
627
628
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
629
aese v5.16b, v26.16b //AES block 8k+13 - round 9
630
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
631
632
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
633
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
634
mov v3.16b, v25.16b //CTR block 8k+19
635
636
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
637
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
638
mov v1.16b, v22.16b //CTR block 8k+17
639
640
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
641
mov v0.16b, v20.16b //CTR block 8k+16
642
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
643
644
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
645
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
646
647
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
648
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
649
650
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
651
b.lt .L128_enc_main_loop
652
653
.L128_enc_prepretail: //PREPRETAIL
654
rev32 v5.16b, v30.16b //CTR block 8k+13
655
ldr q23, [x3, #176] //load h7l | h7h
656
ext v23.16b, v23.16b, v23.16b, #8
657
ldr q25, [x3, #208] //load h8l | h8h
658
ext v25.16b, v25.16b, v25.16b, #8
659
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
660
661
ldr q20, [x3, #128] //load h5l | h5h
662
ext v20.16b, v20.16b, v20.16b, #8
663
ldr q22, [x3, #160] //load h6l | h6h
664
ext v22.16b, v22.16b, v22.16b, #8
665
rev64 v8.16b, v8.16b //GHASH block 8k
666
rev64 v9.16b, v9.16b //GHASH block 8k+1
667
668
ldr q21, [x3, #144] //load h6k | h5k
669
ldr q24, [x3, #192] //load h8k | h7k
670
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
671
rev64 v11.16b, v11.16b //GHASH block 8k+3
672
673
rev64 v10.16b, v10.16b //GHASH block 8k+2
674
eor v8.16b, v8.16b, v19.16b //PRE 1
675
676
rev32 v6.16b, v30.16b //CTR block 8k+14
677
678
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
679
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
680
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
681
682
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
683
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
684
685
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
686
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
687
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
688
689
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
690
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
691
692
ldp q26, q27, [x8, #0] //load rk0, rk1
693
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
694
695
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
696
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
697
698
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
699
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
700
701
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
702
703
rev32 v7.16b, v30.16b //CTR block 8k+15
704
705
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
706
707
aese v2.16b, v26.16b
708
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
709
710
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
711
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
712
713
aese v6.16b, v26.16b
714
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
715
aese v3.16b, v26.16b
716
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
717
718
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
719
aese v1.16b, v26.16b
720
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
721
722
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
723
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
724
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
725
726
aese v5.16b, v26.16b
727
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
728
aese v7.16b, v26.16b
729
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
730
731
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
732
aese v4.16b, v26.16b
733
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
734
aese v0.16b, v26.16b
735
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
736
737
aese v3.16b, v27.16b
738
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
739
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
740
741
ldr q23, [x3, #80] //load h3l | h3h
742
ext v23.16b, v23.16b, v23.16b, #8
743
ldr q25, [x3, #112] //load h4l | h4h
744
ext v25.16b, v25.16b, v25.16b, #8
745
746
ldp q28, q26, [x8, #32] //load rk2, rk3
747
aese v5.16b, v27.16b
748
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
749
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
750
751
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
752
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
753
754
aese v1.16b, v27.16b
755
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
756
aese v0.16b, v27.16b
757
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
758
759
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
760
ldr q21, [x3, #48] //load h2k | h1k
761
ldr q24, [x3, #96] //load h4k | h3k
762
aese v2.16b, v27.16b
763
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
764
765
aese v4.16b, v27.16b
766
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
767
aese v7.16b, v27.16b
768
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
769
770
aese v5.16b, v28.16b
771
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
772
aese v2.16b, v28.16b
773
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
774
aese v3.16b, v28.16b
775
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
776
777
aese v1.16b, v28.16b
778
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
779
aese v6.16b, v27.16b
780
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
781
aese v4.16b, v28.16b
782
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
783
784
aese v5.16b, v26.16b
785
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
786
aese v0.16b, v28.16b
787
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
788
789
aese v6.16b, v28.16b
790
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
791
aese v7.16b, v28.16b
792
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
793
ldp q27, q28, [x8, #64] //load rk4, rk5
794
795
ldr q20, [x3, #32] //load h1l | h1h
796
ext v20.16b, v20.16b, v20.16b, #8
797
ldr q22, [x3, #64] //load h2l | h2h
798
ext v22.16b, v22.16b, v22.16b, #8
799
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
800
aese v0.16b, v26.16b
801
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
802
803
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
804
aese v6.16b, v26.16b
805
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
806
aese v3.16b, v26.16b
807
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
808
809
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
810
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
811
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
812
813
aese v2.16b, v26.16b
814
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
815
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
816
817
aese v7.16b, v26.16b
818
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
819
aese v1.16b, v26.16b
820
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
821
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
822
823
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
824
aese v4.16b, v26.16b
825
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
826
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
827
828
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
829
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
830
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
831
832
aese v1.16b, v27.16b
833
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
834
aese v3.16b, v27.16b
835
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
836
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
837
838
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
839
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
840
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
841
842
aese v1.16b, v28.16b
843
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
844
aese v6.16b, v27.16b
845
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
846
aese v0.16b, v27.16b
847
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
848
849
aese v7.16b, v27.16b
850
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
851
aese v2.16b, v27.16b
852
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
853
854
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
855
aese v4.16b, v27.16b
856
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
857
aese v5.16b, v27.16b
858
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
859
860
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
861
ldp q26, q27, [x8, #96] //load rk6, rk7
862
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
863
864
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
865
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
866
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
867
868
aese v0.16b, v28.16b
869
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
870
aese v7.16b, v28.16b
871
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
872
ldr d16, [x10] //MODULO - load modulo constant
873
874
aese v2.16b, v28.16b
875
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
876
aese v4.16b, v28.16b
877
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
878
879
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
880
aese v5.16b, v28.16b
881
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
882
aese v6.16b, v28.16b
883
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
884
885
aese v3.16b, v28.16b
886
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
887
aese v4.16b, v26.16b
888
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
889
890
aese v5.16b, v26.16b
891
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
892
aese v2.16b, v26.16b
893
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
894
aese v0.16b, v26.16b
895
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
896
897
aese v3.16b, v26.16b
898
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
899
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
900
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
901
902
aese v6.16b, v26.16b
903
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
904
aese v1.16b, v26.16b
905
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
906
aese v7.16b, v26.16b
907
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
908
909
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
910
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
911
ldp q28, q26, [x8, #128] //load rk8, rk9
912
913
aese v3.16b, v27.16b
914
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
915
aese v6.16b, v27.16b
916
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
917
aese v1.16b, v27.16b
918
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
919
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
920
921
aese v5.16b, v27.16b
922
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
923
aese v0.16b, v27.16b
924
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
925
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
926
927
aese v2.16b, v27.16b
928
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
929
aese v7.16b, v27.16b
930
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
931
932
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
933
aese v4.16b, v27.16b
934
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
935
936
aese v7.16b, v28.16b
937
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
938
aese v2.16b, v28.16b
939
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
940
aese v1.16b, v28.16b
941
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
942
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
943
944
aese v6.16b, v28.16b
945
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
946
.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low
947
aese v4.16b, v28.16b
948
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
949
950
aese v3.16b, v28.16b
951
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
952
aese v0.16b, v28.16b
953
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
954
aese v5.16b, v28.16b
955
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
956
957
ldr q27, [x8, #160] //load rk10
958
aese v6.16b, v26.16b //AES block 8k+14 - round 9
959
aese v2.16b, v26.16b //AES block 8k+10 - round 9
960
961
aese v0.16b, v26.16b //AES block 8k+8 - round 9
962
aese v1.16b, v26.16b //AES block 8k+9 - round 9
963
964
aese v3.16b, v26.16b //AES block 8k+11 - round 9
965
aese v5.16b, v26.16b //AES block 8k+13 - round 9
966
967
aese v4.16b, v26.16b //AES block 8k+12 - round 9
968
aese v7.16b, v26.16b //AES block 8k+15 - round 9
969
.L128_enc_tail: //TAIL
// Tail dispatch for the encrypt kernel.
//   x5  = x4 - x0, the number of input bytes still to process.
//   v8  = first remaining plaintext block, v9 = v8 ^ v0 ^ v29 (its ciphertext).
//   v29 = copy of v27; on the fall-through path from the code above v27 holds
//         rk10. The copy is needed because the sections below reuse v27 as
//         GHASH "mid" scratch.
//   v16 = v19 with its 64-bit halves swapped: the running tag, which is XORed
//         into the first block that gets hashed in the tail.
// The compare chain (112, 96, ..., 16 bytes; signed b.gt) picks the entry
// section. Each section consumes a fixed keystream register (v1 in
// more_than_7, v2 in more_than_6, ... v7 in more_than_1), so every time a
// branch is not taken the keystream registers are shifted up by one so that
// v1's block lands in the register the chosen entry section reads.
// Each not-taken branch also subtracts v31 (the per-block counter increment)
// from v30 once, so the stored counter ends up rolled back by one step per
// keystream block that was prepared but is not used.
970
971
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
972
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
973
974
mov v29.16b, v27.16b
975
ldp q20, q21, [x3, #128] //load h5l | h5h (v20) and h6k | h5k (v21)
976
ext v20.16b, v20.16b, v20.16b, #8
977
978
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
979
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
980
ldp q22, q23, [x3, #160] //load h6l | h6h (v22) and h7l | h7h (v23)
981
ext v22.16b, v22.16b, v22.16b, #8
982
ext v23.16b, v23.16b, v23.16b, #8
983
984
ldp q24, q25, [x3, #192] //load h8k | h7k (v24) and h8l | h8h (v25)
985
ext v25.16b, v25.16b, v25.16b, #8
986
cmp x5, #112
987
b.gt .L128_enc_blocks_more_than_7
988
989
// Fewer than 8 blocks: the more_than_7 section, which would overwrite
// v17/v18/v19 with its products, is skipped, so the GHASH high/mid/low
// accumulators are cleared here instead (a write to the 64-bit .8b view also
// clears the upper half of the register). The tag was already copied to v16.
mov v7.16b, v6.16b
990
mov v6.16b, v5.16b
991
movi v17.8b, #0
992
993
cmp x5, #96
994
sub v30.4s, v30.4s, v31.4s
995
mov v5.16b, v4.16b
996
997
mov v4.16b, v3.16b
998
mov v3.16b, v2.16b
999
mov v2.16b, v1.16b
1000
1001
movi v19.8b, #0
1002
movi v18.8b, #0
1003
b.gt .L128_enc_blocks_more_than_6
1004
1005
// From here on v1 is still the untouched second keystream block, so each
// further shift only needs to copy v1 into the new entry register.
mov v7.16b, v6.16b
1006
cmp x5, #80
1007
1008
sub v30.4s, v30.4s, v31.4s
1009
mov v6.16b, v5.16b
1010
mov v5.16b, v4.16b
1011
1012
mov v4.16b, v3.16b
1013
mov v3.16b, v1.16b
1014
b.gt .L128_enc_blocks_more_than_5
1015
1016
cmp x5, #64
1017
sub v30.4s, v30.4s, v31.4s
1018
1019
mov v7.16b, v6.16b
1020
mov v6.16b, v5.16b
1021
1022
mov v5.16b, v4.16b
1023
mov v4.16b, v1.16b
1024
b.gt .L128_enc_blocks_more_than_4
1025
1026
mov v7.16b, v6.16b
1027
sub v30.4s, v30.4s, v31.4s
1028
mov v6.16b, v5.16b
1029
1030
mov v5.16b, v1.16b
1031
cmp x5, #48
1032
b.gt .L128_enc_blocks_more_than_3
1033
1034
sub v30.4s, v30.4s, v31.4s
1035
mov v7.16b, v6.16b
1036
mov v6.16b, v1.16b
1037
1038
cmp x5, #32
1039
// more_than_2 multiplies by v24 but the load of offset 96 lives in
// more_than_3, which is skipped on this path, so it is loaded here.
ldr q24, [x3, #96] //load h4k | h3k
1040
b.gt .L128_enc_blocks_more_than_2
1041
1042
cmp x5, #16
1043
1044
sub v30.4s, v30.4s, v31.4s
1045
mov v7.16b, v1.16b
1046
b.gt .L128_enc_blocks_more_than_1
1047
1048
// less_than_1 multiplies by v21 but the load of offset 48 lives in
// more_than_1, which is skipped on this path, so it is loaded here.
ldr q21, [x3, #48] //load h2k | h1k
1049
sub v30.4s, v30.4s, v31.4s
1050
b .L128_enc_blocks_less_than_1
1051
.L128_enc_blocks_more_than_7: //blocks left > 7
// Stores the ciphertext in v9 to [x2], hashes it, and encrypts the next
// plaintext block with keystream v1. This is the first GHASH step of the
// tail on this path, so v17 (high), v18 (mid) and v19 (low) are assigned the
// products directly rather than XOR-accumulated. Uses v25 (table offset 208)
// and the upper half of v24 (table offset 192). Falls through.
1052
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
1053
1054
rev64 v8.16b, v9.16b //GHASH final-7 block
1055
ldr q9, [x0], #16 //AES final-6 block - load plaintext
1056
1057
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1058
1059
// v27 is scratch from here on; the round key it held was copied to v29.
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
1060
1061
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
1062
1063
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
1064
1065
// low 64 bits of v27 = v8.d[1] ^ v8.d[0], the operand for the mid product
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
1066
movi v16.8b, #0 //suppress further partial tag feed in
1067
1068
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
1069
1070
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
1071
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
1072
.L128_enc_blocks_more_than_6: //blocks left > 6
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17 (high), v18 (mid) and v19 (low) using v23 (table offset 176) and
// the lower half of v24, and encrypts the next plaintext block with
// keystream v2. v16 is non-zero only if this is the entry section; it is
// cleared after use. Falls through.
1073
1074
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
1075
1076
rev64 v8.16b, v9.16b //GHASH final-6 block
1077
ldr q9, [x0], #16 //AES final-5 block - load plaintext
1078
1079
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1080
1081
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
1082
1083
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
1084
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
1085
1086
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
1087
movi v16.8b, #0 //suppress further partial tag feed in
1088
1089
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
1090
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
1091
1092
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
1093
1094
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
1095
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
1096
.L128_enc_blocks_more_than_5: //blocks left > 5
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17/v18/v19 using v22 (table offset 160) and the upper half of v21
// (table offset 144), and encrypts the next plaintext block with keystream
// v3. Falls through.
1097
1098
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
1099
1100
rev64 v8.16b, v9.16b //GHASH final-5 block
1101
1102
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1103
1104
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
1105
ldr q9, [x0], #16 //AES final-4 block - load plaintext
1106
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
1107
1108
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
1109
1110
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
1111
1112
// duplicate the mid operand into the upper half so pmull2 can pair it with
// the upper half of v21
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
1113
1114
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
1115
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
1116
movi v16.8b, #0 //suppress further partial tag feed in
1117
1118
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
1119
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
1120
1121
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
1122
.L128_enc_blocks_more_than_4: //blocks left > 4
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17/v18/v19 using v20 (table offset 128) and the lower half of v21
// (table offset 144), and encrypts the next plaintext block with keystream
// v4. Falls through.
1123
1124
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
1125
1126
rev64 v8.16b, v9.16b //GHASH final-4 block
1127
1128
ldr q9, [x0], #16 //AES final-3 block - load plaintext
1129
1130
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1131
1132
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
1133
movi v16.8b, #0 //suppress further partial tag feed in
1134
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
1135
1136
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
1137
1138
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
1139
1140
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
1141
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
1142
1143
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
1144
1145
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
1146
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
1147
.L128_enc_blocks_more_than_3: //blocks left > 3
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17/v18/v19, and encrypts the next plaintext block with keystream v5.
// Reloads v25 and v24 from table offsets 112 and 96; the values loaded by
// the tail dispatch (offsets 208 and 192) are replaced from here on. The
// mid product uses the upper half of v24. Falls through.
1148
1149
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
1150
1151
ldr q25, [x3, #112] //load h4l | h4h
1152
ext v25.16b, v25.16b, v25.16b, #8
1153
1154
rev64 v8.16b, v9.16b //GHASH final-3 block
1155
1156
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1157
movi v16.8b, #0 //suppress further partial tag feed in
1158
1159
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
1160
ldr q24, [x3, #96] //load h4k | h3k
1161
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
1162
1163
ldr q9, [x0], #16 //AES final-2 block - load plaintext
1164
1165
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
1166
1167
// duplicate the mid operand into the upper half so pmull2 can pair it with
// the upper half of v24
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
1168
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
1169
1170
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
1171
1172
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
1173
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
1174
1175
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
1176
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
1177
.L128_enc_blocks_more_than_2: //blocks left > 2
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17/v18/v19, and encrypts the next plaintext block with keystream v6.
// Reloads v23 from table offset 80. The mid product uses the lower half of
// v24, which holds table offset 96 on every path that reaches this label
// (loaded in the section above or in the tail dispatch). Falls through.
1178
1179
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
1180
1181
rev64 v8.16b, v9.16b //GHASH final-2 block
1182
1183
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1184
1185
ldr q9, [x0], #16 //AES final-1 block - load plaintext
1186
1187
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
1188
ldr q23, [x3, #80] //load h3l | h3h
1189
ext v23.16b, v23.16b, v23.16b, #8
1190
movi v16.8b, #0 //suppress further partial tag feed in
1191
1192
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
1193
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
1194
1195
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
1196
1197
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
1198
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
1199
1200
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
1201
1202
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
1203
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
1204
.L128_enc_blocks_more_than_1: //blocks left > 1
// Stores the ciphertext in v9 to [x2], XOR-accumulates its GHASH products
// into v17/v18/v19, and encrypts the final plaintext block with keystream
// v7. Reloads v22 from table offset 64 and v21 from table offset 48; the mid
// product uses the upper half of v21 and the following section uses its
// lower half. Falls through.
1205
1206
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
1207
1208
ldr q22, [x3, #64] //load h2l | h2h
1209
ext v22.16b, v22.16b, v22.16b, #8
1210
rev64 v8.16b, v9.16b //GHASH final-1 block
1211
ldr q9, [x0], #16 //AES final block - load plaintext
1212
1213
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1214
1215
movi v16.8b, #0 //suppress further partial tag feed in
1216
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
1217
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
1218
1219
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
1220
1221
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
1222
1223
ldr q21, [x3, #48] //load h2k | h1k
1224
1225
// duplicate the mid operand into the upper half so pmull2 can pair it with
// the upper half of v21
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
1226
1227
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
1228
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
1229
1230
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
1231
1232
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
1233
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
1234
.L128_enc_blocks_less_than_1: //blocks left <= 1
// Final block, GHASH reduction and function epilogue.
//   1. Byte-swaps the counter in v30 back and stores it through x16.
//   2. Builds a 128-bit mask in v0 from the bit length in x1, masks the last
//      ciphertext block (v9) and merges it with the bytes already at [x2].
//   3. Hashes the masked block using v20 (table offset 32) and the lower
//      half of v21, then reduces v17:v18:v19 with the constant at [x10].
//   4. Writes the reduced hash to [x3], returns x9 in x0, restores d8-d15
//      and releases the 80-byte stack frame.
// NOTE(review): the last block is always loaded, read back from [x2] and
// stored as a full 16 bytes even when the bit length is not a multiple of
// 128 - presumably callers guarantee those 16 bytes are accessible; confirm.
1235
1236
rev32 v30.16b, v30.16b
1237
str q30, [x16] //store the updated counter
1238
// The next four ALU steps turn x1 into (128 - (x1 mod 128)) mod 128, the
// number of unused bits in the final 16-byte block (0 when it is full).
and x1, x1, #127 //bit_length %= 128
1239
1240
sub x1, x1, #128 //bit_length -= 128
1241
1242
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
1243
1244
mvn x6, xzr //temp0_x = 0xffffffffffffffff
1245
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
1246
and x1, x1, #127 //bit_length %= 128
1247
1248
// a register-specified lsr uses only the low 6 bits of x1, so
// x6 = all-ones >> (x1 mod 64)
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
1249
mvn x7, xzr //temp1_x = 0xffffffffffffffff
1250
cmp x1, #64
1251
1252
// fewer than 64 unused bits: d[0] = all ones, d[1] = x6
// 64 or more unused bits:    d[0] = x6,       d[1] = 0
csel x13, x7, x6, lt
1253
csel x14, x6, xzr, lt
1254
1255
mov v0.d[1], x14
1256
mov v0.d[0], x13 //ctr0b is mask for last block
1257
1258
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
1259
1260
rev64 v8.16b, v9.16b //GHASH final block
1261
1262
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
1263
st1 { v9.16b}, [x2] //store all 16B
1264
1265
eor v8.16b, v8.16b, v16.16b //feed in partial tag
1266
1267
// v16 has served its purpose as the partial tag and is reused as mid scratch
ins v16.d[0], v8.d[1] //GHASH final block - mid
1268
1269
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
1270
ldr q20, [x3, #32] //load h1l | h1h
1271
ext v20.16b, v20.16b, v20.16b, #8
1272
1273
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
1274
1275
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
1276
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
1277
ldr d16, [x10] //MODULO - load modulo constant
1278
1279
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
1280
1281
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
1282
1283
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
1284
1285
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
1286
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
1287
1288
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
1289
1290
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
1291
1292
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
1293
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
1294
1295
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
1296
// swap halves and byte-reverse each half before writing the hash to [x3];
// the decrypt kernel applies the same two steps after loading from [x3]
ext v19.16b, v19.16b, v19.16b, #8
1297
rev64 v19.16b, v19.16b
1298
st1 { v19.16b }, [x3]
1299
mov x0, x9 //return value: x9 = x1 >> 3, set at function entry
1300
1301
ldp d10, d11, [sp, #16]
1302
ldp d12, d13, [sp, #32]
1303
ldp d14, d15, [sp, #48]
1304
ldp d8, d9, [sp], #80
1305
ret
1306
1307
.L128_enc_ret:
// Early exit taken by the cbz at function entry when x1 is zero. Nothing
// has been pushed on the stack at that point, so this returns 0 directly.
1308
mov w0, #0x0
1309
ret
1310
.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1311
.globl unroll8_eor3_aes_gcm_dec_128_kernel
1312
.type unroll8_eor3_aes_gcm_dec_128_kernel,%function
1313
.align 4
1314
unroll8_eor3_aes_gcm_dec_128_kernel:
1315
AARCH64_VALID_CALL_TARGET
1316
cbz x1, .L128_dec_ret
1317
stp d8, d9, [sp, #-80]!
1318
lsr x9, x1, #3
1319
mov x16, x4
1320
mov x8, x5
1321
stp d10, d11, [sp, #16]
1322
stp d12, d13, [sp, #32]
1323
stp d14, d15, [sp, #48]
1324
mov x5, #0xc200000000000000
1325
stp x5, xzr, [sp, #64]
1326
add x10, sp, #64
1327
1328
mov x5, x9
1329
ld1 { v0.16b}, [x16] //CTR block 0
1330
1331
ldp q26, q27, [x8, #0] //load rk0, rk1
1332
sub x5, x5, #1 //byte_len - 1
1333
1334
mov x15, #0x100000000 //set up counter increment
1335
movi v31.16b, #0x0
1336
mov v31.d[1], x15
1337
ld1 { v19.16b}, [x3]
1338
ext v19.16b, v19.16b, v19.16b, #8
1339
rev64 v19.16b, v19.16b
1340
1341
rev32 v30.16b, v0.16b //set up reversed counter
1342
1343
aese v0.16b, v26.16b
1344
aesmc v0.16b, v0.16b //AES block 0 - round 0
1345
1346
add v30.4s, v30.4s, v31.4s //CTR block 0
1347
1348
rev32 v1.16b, v30.16b //CTR block 1
1349
add v30.4s, v30.4s, v31.4s //CTR block 1
1350
1351
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1352
1353
rev32 v2.16b, v30.16b //CTR block 2
1354
add v30.4s, v30.4s, v31.4s //CTR block 2
1355
aese v1.16b, v26.16b
1356
aesmc v1.16b, v1.16b //AES block 1 - round 0
1357
1358
rev32 v3.16b, v30.16b //CTR block 3
1359
add v30.4s, v30.4s, v31.4s //CTR block 3
1360
1361
aese v0.16b, v27.16b
1362
aesmc v0.16b, v0.16b //AES block 0 - round 1
1363
aese v1.16b, v27.16b
1364
aesmc v1.16b, v1.16b //AES block 1 - round 1
1365
1366
rev32 v4.16b, v30.16b //CTR block 4
1367
add v30.4s, v30.4s, v31.4s //CTR block 4
1368
1369
rev32 v5.16b, v30.16b //CTR block 5
1370
add v30.4s, v30.4s, v31.4s //CTR block 5
1371
1372
aese v2.16b, v26.16b
1373
aesmc v2.16b, v2.16b //AES block 2 - round 0
1374
1375
rev32 v6.16b, v30.16b //CTR block 6
1376
add v30.4s, v30.4s, v31.4s //CTR block 6
1377
aese v5.16b, v26.16b
1378
aesmc v5.16b, v5.16b //AES block 5 - round 0
1379
1380
aese v3.16b, v26.16b
1381
aesmc v3.16b, v3.16b //AES block 3 - round 0
1382
aese v4.16b, v26.16b
1383
aesmc v4.16b, v4.16b //AES block 4 - round 0
1384
1385
rev32 v7.16b, v30.16b //CTR block 7
1386
1387
aese v6.16b, v26.16b
1388
aesmc v6.16b, v6.16b //AES block 6 - round 0
1389
aese v2.16b, v27.16b
1390
aesmc v2.16b, v2.16b //AES block 2 - round 1
1391
1392
aese v7.16b, v26.16b
1393
aesmc v7.16b, v7.16b //AES block 7 - round 0
1394
1395
ldp q28, q26, [x8, #32] //load rk2, rk3
1396
1397
aese v6.16b, v27.16b
1398
aesmc v6.16b, v6.16b //AES block 6 - round 1
1399
aese v5.16b, v27.16b
1400
aesmc v5.16b, v5.16b //AES block 5 - round 1
1401
1402
aese v4.16b, v27.16b
1403
aesmc v4.16b, v4.16b //AES block 4 - round 1
1404
aese v7.16b, v27.16b
1405
aesmc v7.16b, v7.16b //AES block 7 - round 1
1406
1407
aese v7.16b, v28.16b
1408
aesmc v7.16b, v7.16b //AES block 7 - round 2
1409
aese v0.16b, v28.16b
1410
aesmc v0.16b, v0.16b //AES block 0 - round 2
1411
aese v3.16b, v27.16b
1412
aesmc v3.16b, v3.16b //AES block 3 - round 1
1413
1414
aese v6.16b, v28.16b
1415
aesmc v6.16b, v6.16b //AES block 6 - round 2
1416
aese v2.16b, v28.16b
1417
aesmc v2.16b, v2.16b //AES block 2 - round 2
1418
aese v5.16b, v28.16b
1419
aesmc v5.16b, v5.16b //AES block 5 - round 2
1420
1421
aese v4.16b, v28.16b
1422
aesmc v4.16b, v4.16b //AES block 4 - round 2
1423
aese v3.16b, v28.16b
1424
aesmc v3.16b, v3.16b //AES block 3 - round 2
1425
aese v1.16b, v28.16b
1426
aesmc v1.16b, v1.16b //AES block 1 - round 2
1427
1428
aese v6.16b, v26.16b
1429
aesmc v6.16b, v6.16b //AES block 6 - round 3
1430
aese v2.16b, v26.16b
1431
aesmc v2.16b, v2.16b //AES block 2 - round 3
1432
1433
ldp q27, q28, [x8, #64] //load rk4, rk5
1434
aese v5.16b, v26.16b
1435
aesmc v5.16b, v5.16b //AES block 5 - round 3
1436
1437
aese v0.16b, v26.16b
1438
aesmc v0.16b, v0.16b //AES block 0 - round 3
1439
aese v7.16b, v26.16b
1440
aesmc v7.16b, v7.16b //AES block 7 - round 3
1441
1442
aese v3.16b, v26.16b
1443
aesmc v3.16b, v3.16b //AES block 3 - round 3
1444
aese v1.16b, v26.16b
1445
aesmc v1.16b, v1.16b //AES block 1 - round 3
1446
1447
aese v0.16b, v27.16b
1448
aesmc v0.16b, v0.16b //AES block 0 - round 4
1449
aese v7.16b, v27.16b
1450
aesmc v7.16b, v7.16b //AES block 7 - round 4
1451
aese v4.16b, v26.16b
1452
aesmc v4.16b, v4.16b //AES block 4 - round 3
1453
1454
aese v6.16b, v27.16b
1455
aesmc v6.16b, v6.16b //AES block 6 - round 4
1456
aese v1.16b, v27.16b
1457
aesmc v1.16b, v1.16b //AES block 1 - round 4
1458
aese v3.16b, v27.16b
1459
aesmc v3.16b, v3.16b //AES block 3 - round 4
1460
1461
aese v5.16b, v27.16b
1462
aesmc v5.16b, v5.16b //AES block 5 - round 4
1463
aese v4.16b, v27.16b
1464
aesmc v4.16b, v4.16b //AES block 4 - round 4
1465
aese v2.16b, v27.16b
1466
aesmc v2.16b, v2.16b //AES block 2 - round 4
1467
1468
ldp q26, q27, [x8, #96] //load rk6, rk7
1469
aese v2.16b, v28.16b
1470
aesmc v2.16b, v2.16b //AES block 2 - round 5
1471
aese v3.16b, v28.16b
1472
aesmc v3.16b, v3.16b //AES block 3 - round 5
1473
1474
aese v6.16b, v28.16b
1475
aesmc v6.16b, v6.16b //AES block 6 - round 5
1476
aese v1.16b, v28.16b
1477
aesmc v1.16b, v1.16b //AES block 1 - round 5
1478
1479
aese v7.16b, v28.16b
1480
aesmc v7.16b, v7.16b //AES block 7 - round 5
1481
aese v5.16b, v28.16b
1482
aesmc v5.16b, v5.16b //AES block 5 - round 5
1483
1484
aese v4.16b, v28.16b
1485
aesmc v4.16b, v4.16b //AES block 4 - round 5
1486
1487
aese v3.16b, v26.16b
1488
aesmc v3.16b, v3.16b //AES block 3 - round 6
1489
aese v2.16b, v26.16b
1490
aesmc v2.16b, v2.16b //AES block 2 - round 6
1491
aese v0.16b, v28.16b
1492
aesmc v0.16b, v0.16b //AES block 0 - round 5
1493
1494
aese v5.16b, v26.16b
1495
aesmc v5.16b, v5.16b //AES block 5 - round 6
1496
aese v4.16b, v26.16b
1497
aesmc v4.16b, v4.16b //AES block 4 - round 6
1498
aese v1.16b, v26.16b
1499
aesmc v1.16b, v1.16b //AES block 1 - round 6
1500
1501
aese v0.16b, v26.16b
1502
aesmc v0.16b, v0.16b //AES block 0 - round 6
1503
aese v7.16b, v26.16b
1504
aesmc v7.16b, v7.16b //AES block 7 - round 6
1505
aese v6.16b, v26.16b
1506
aesmc v6.16b, v6.16b //AES block 6 - round 6
1507
1508
aese v3.16b, v27.16b
1509
aesmc v3.16b, v3.16b //AES block 3 - round 7
1510
aese v4.16b, v27.16b
1511
aesmc v4.16b, v4.16b //AES block 4 - round 7
1512
aese v1.16b, v27.16b
1513
aesmc v1.16b, v1.16b //AES block 1 - round 7
1514
1515
aese v7.16b, v27.16b
1516
aesmc v7.16b, v7.16b //AES block 7 - round 7
1517
aese v5.16b, v27.16b
1518
aesmc v5.16b, v5.16b //AES block 5 - round 7
1519
ldp q28, q26, [x8, #128] //load rk8, rk9
1520
1521
aese v6.16b, v27.16b
1522
aesmc v6.16b, v6.16b //AES block 6 - round 7
1523
aese v2.16b, v27.16b
1524
aesmc v2.16b, v2.16b //AES block 2 - round 7
1525
aese v0.16b, v27.16b
1526
aesmc v0.16b, v0.16b //AES block 0 - round 7
1527
1528
add x5, x5, x0
1529
add v30.4s, v30.4s, v31.4s //CTR block 7
1530
1531
aese v6.16b, v28.16b
1532
aesmc v6.16b, v6.16b //AES block 6 - round 8
1533
aese v0.16b, v28.16b
1534
aesmc v0.16b, v0.16b //AES block 0 - round 8
1535
1536
aese v1.16b, v28.16b
1537
aesmc v1.16b, v1.16b //AES block 1 - round 8
1538
aese v7.16b, v28.16b
1539
aesmc v7.16b, v7.16b //AES block 7 - round 8
1540
aese v3.16b, v28.16b
1541
aesmc v3.16b, v3.16b //AES block 3 - round 8
1542
1543
aese v5.16b, v28.16b
1544
aesmc v5.16b, v5.16b //AES block 5 - round 8
1545
aese v2.16b, v28.16b
1546
aesmc v2.16b, v2.16b //AES block 2 - round 8
1547
aese v4.16b, v28.16b
1548
aesmc v4.16b, v4.16b //AES block 4 - round 8
1549
1550
aese v0.16b, v26.16b //AES block 0 - round 9
1551
aese v1.16b, v26.16b //AES block 1 - round 9
1552
aese v6.16b, v26.16b //AES block 6 - round 9
1553
1554
ldr q27, [x8, #160] //load rk10
1555
aese v4.16b, v26.16b //AES block 4 - round 9
1556
aese v3.16b, v26.16b //AES block 3 - round 9
1557
1558
aese v2.16b, v26.16b //AES block 2 - round 9
1559
aese v5.16b, v26.16b //AES block 5 - round 9
1560
aese v7.16b, v26.16b //AES block 7 - round 9
1561
1562
add x4, x0, x1, lsr #3 //end_input_ptr
1563
cmp x0, x5 //check if we have <= 8 blocks
1564
b.ge .L128_dec_tail //handle tail
1565
1566
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
1567
1568
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
1569
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
1570
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
1571
1572
rev32 v0.16b, v30.16b //CTR block 8
1573
add v30.4s, v30.4s, v31.4s //CTR block 8
1574
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
1575
1576
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
1577
1578
rev32 v1.16b, v30.16b //CTR block 9
1579
add v30.4s, v30.4s, v31.4s //CTR block 9
1580
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
1581
1582
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
1583
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
1584
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
1585
1586
rev32 v2.16b, v30.16b //CTR block 10
1587
add v30.4s, v30.4s, v31.4s //CTR block 10
1588
1589
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
1590
1591
rev32 v3.16b, v30.16b //CTR block 11
1592
add v30.4s, v30.4s, v31.4s //CTR block 11
1593
1594
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
1595
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
1596
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
1597
1598
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result
1599
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
1600
rev32 v4.16b, v30.16b //CTR block 12
1601
1602
cmp x0, x5 //check if we have <= 8 blocks
1603
add v30.4s, v30.4s, v31.4s //CTR block 12
1604
b.ge .L128_dec_prepretail //do prepretail
1605
1606
.L128_dec_main_loop: //main loop start
1607
ldr q23, [x3, #176] //load h7l | h7h
1608
ext v23.16b, v23.16b, v23.16b, #8
1609
ldr q25, [x3, #208] //load h8l | h8h
1610
ext v25.16b, v25.16b, v25.16b, #8
1611
1612
rev64 v9.16b, v9.16b //GHASH block 8k+1
1613
rev64 v8.16b, v8.16b //GHASH block 8k
1614
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
1615
1616
rev64 v14.16b, v14.16b //GHASH block 8k+6
1617
ldr q20, [x3, #128] //load h5l | h5h
1618
ext v20.16b, v20.16b, v20.16b, #8
1619
ldr q22, [x3, #160] //load h6l | h6h
1620
ext v22.16b, v22.16b, v22.16b, #8
1621
1622
eor v8.16b, v8.16b, v19.16b //PRE 1
1623
rev32 v5.16b, v30.16b //CTR block 8k+13
1624
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
1625
1626
rev64 v10.16b, v10.16b //GHASH block 8k+2
1627
rev64 v12.16b, v12.16b //GHASH block 8k+4
1628
ldp q26, q27, [x8, #0] //load rk0, rk1
1629
1630
rev32 v6.16b, v30.16b //CTR block 8k+14
1631
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
1632
ldr q21, [x3, #144] //load h6k | h5k
1633
ldr q24, [x3, #192] //load h8k | h7k
1634
1635
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
1636
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
1637
rev64 v11.16b, v11.16b //GHASH block 8k+3
1638
1639
rev32 v7.16b, v30.16b //CTR block 8k+15
1640
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
1641
rev64 v13.16b, v13.16b //GHASH block 8k+5
1642
1643
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
1644
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
1645
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
1646
1647
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
1648
aese v4.16b, v26.16b
1649
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
1650
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
1651
1652
aese v6.16b, v26.16b
1653
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
1654
aese v5.16b, v26.16b
1655
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
1656
aese v7.16b, v26.16b
1657
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
1658
1659
aese v3.16b, v26.16b
1660
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
1661
aese v2.16b, v26.16b
1662
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
1663
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
1664
1665
aese v1.16b, v26.16b
1666
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
1667
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
1668
aese v0.16b, v26.16b
1669
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
1670
1671
aese v2.16b, v27.16b
1672
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
1673
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
1674
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
1675
1676
ldp q28, q26, [x8, #32] //load rk2, rk3
1677
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
1678
aese v7.16b, v27.16b
1679
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
1680
1681
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
1682
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
1683
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
1684
1685
ldr q23, [x3, #80] //load h3l | h3h
1686
ext v23.16b, v23.16b, v23.16b, #8
1687
ldr q25, [x3, #112] //load h4l | h4h
1688
ext v25.16b, v25.16b, v25.16b, #8
1689
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
1690
aese v6.16b, v27.16b
1691
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
1692
1693
aese v4.16b, v27.16b
1694
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
1695
aese v5.16b, v27.16b
1696
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
1697
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
1698
1699
aese v3.16b, v27.16b
1700
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
1701
aese v0.16b, v27.16b
1702
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
1703
aese v1.16b, v27.16b
1704
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
1705
1706
aese v7.16b, v28.16b
1707
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
1708
aese v2.16b, v28.16b
1709
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
1710
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
1711
1712
aese v4.16b, v28.16b
1713
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
1714
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
1715
ldr q20, [x3, #32] //load h1l | h1h
1716
ext v20.16b, v20.16b, v20.16b, #8
1717
ldr q22, [x3, #64] //load h2l | h2h
1718
ext v22.16b, v22.16b, v22.16b, #8
1719
1720
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
1721
aese v1.16b, v28.16b
1722
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
1723
aese v3.16b, v28.16b
1724
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
1725
1726
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
1727
aese v5.16b, v28.16b
1728
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
1729
aese v0.16b, v28.16b
1730
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
1731
1732
aese v6.16b, v28.16b
1733
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
1734
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
1735
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
1736
1737
aese v7.16b, v26.16b
1738
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
1739
rev64 v15.16b, v15.16b //GHASH block 8k+7
1740
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
1741
1742
ldp q27, q28, [x8, #64] //load rk4, rk5
1743
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
1744
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
1745
1746
ldr q21, [x3, #48] //load h2k | h1k
1747
ldr q24, [x3, #96] //load h4k | h3k
1748
aese v2.16b, v26.16b
1749
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
1750
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
1751
1752
aese v4.16b, v26.16b
1753
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
1754
aese v3.16b, v26.16b
1755
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
1756
aese v1.16b, v26.16b
1757
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
1758
1759
aese v0.16b, v26.16b
1760
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
1761
aese v6.16b, v26.16b
1762
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
1763
aese v5.16b, v26.16b
1764
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
1765
1766
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
1767
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
1768
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
1769
1770
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
1771
aese v0.16b, v27.16b
1772
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
1773
aese v7.16b, v27.16b
1774
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
1775
1776
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
1777
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
1778
aese v3.16b, v27.16b
1779
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
1780
1781
aese v1.16b, v27.16b
1782
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
1783
aese v5.16b, v27.16b
1784
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
1785
aese v6.16b, v27.16b
1786
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
1787
1788
aese v2.16b, v27.16b
1789
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
1790
aese v4.16b, v27.16b
1791
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
1792
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
1793
1794
ldp q26, q27, [x8, #96] //load rk6, rk7
1795
aese v0.16b, v28.16b
1796
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
1797
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
1798
1799
aese v2.16b, v28.16b
1800
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
1801
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
1802
aese v1.16b, v28.16b
1803
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
1804
1805
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
1806
aese v6.16b, v28.16b
1807
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
1808
aese v7.16b, v28.16b
1809
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
1810
1811
aese v3.16b, v28.16b
1812
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
1813
aese v5.16b, v28.16b
1814
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
1815
aese v4.16b, v28.16b
1816
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
1817
1818
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
1819
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
1820
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
1821
1822
aese v3.16b, v26.16b
1823
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
1824
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
1825
aese v7.16b, v26.16b
1826
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
1827
1828
aese v1.16b, v26.16b
1829
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
1830
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
1831
aese v6.16b, v26.16b
1832
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
1833
1834
aese v2.16b, v26.16b
1835
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
1836
aese v5.16b, v26.16b
1837
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
1838
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
1839
1840
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
1841
aese v0.16b, v26.16b
1842
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
1843
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
1844
1845
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
1846
aese v4.16b, v26.16b
1847
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
1848
ldp q28, q26, [x8, #128] //load rk8, rk9
1849
1850
ldr d16, [x10] //MODULO - load modulo constant
1851
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
1852
aese v5.16b, v27.16b
1853
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
1854
1855
rev32 v20.16b, v30.16b //CTR block 8k+16
1856
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
1857
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
1858
1859
aese v6.16b, v27.16b
1860
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
1861
aese v3.16b, v27.16b
1862
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
1863
aese v7.16b, v27.16b
1864
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
1865
1866
aese v2.16b, v27.16b
1867
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
1868
aese v1.16b, v27.16b
1869
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
1870
rev32 v22.16b, v30.16b //CTR block 8k+17
1871
1872
aese v4.16b, v27.16b
1873
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
1874
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
1875
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
1876
1877
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
1878
aese v0.16b, v27.16b
1879
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
1880
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
1881
1882
aese v5.16b, v28.16b
1883
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
1884
aese v1.16b, v28.16b
1885
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
1886
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
1887
1888
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
1889
aese v0.16b, v28.16b
1890
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
1891
rev32 v23.16b, v30.16b //CTR block 8k+18
1892
1893
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
1894
aese v4.16b, v28.16b
1895
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
1896
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
1897
1898
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
1899
aese v3.16b, v28.16b
1900
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
1901
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
1902
1903
aese v7.16b, v28.16b
1904
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
1905
aese v2.16b, v28.16b
1906
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
1907
aese v6.16b, v28.16b
1908
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
1909
1910
aese v0.16b, v26.16b //AES block 8k+8 - round 9
1911
aese v1.16b, v26.16b //AES block 8k+9 - round 9
1912
ldr q27, [x8, #160] //load rk10
1913
1914
aese v6.16b, v26.16b //AES block 8k+14 - round 9
1915
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
1916
aese v2.16b, v26.16b //AES block 8k+10 - round 9
1917
1918
aese v7.16b, v26.16b //AES block 8k+15 - round 9
1919
aese v4.16b, v26.16b //AES block 8k+12 - round 9
1920
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
1921
1922
rev32 v25.16b, v30.16b //CTR block 8k+19
1923
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
1924
1925
aese v3.16b, v26.16b //AES block 8k+11 - round 9
1926
aese v5.16b, v26.16b //AES block 8k+13 - round 9
1927
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
1928
1929
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
1930
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
1931
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
1932
1933
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
1934
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
1935
mov v1.16b, v22.16b //CTR block 8k+17
1936
1937
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
1938
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
1939
mov v0.16b, v20.16b //CTR block 8k+16
1940
1941
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
1942
cmp x0, x5 //.LOOP CONTROL
1943
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
1944
1945
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
1946
mov v2.16b, v23.16b //CTR block 8k+18
1947
1948
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
1949
rev32 v4.16b, v30.16b //CTR block 8k+20
1950
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
1951
1952
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
1953
mov v3.16b, v25.16b //CTR block 8k+19
1954
b.lt .L128_dec_main_loop
1955
1956
.L128_dec_prepretail: //PREPRETAIL
1957
rev64 v11.16b, v11.16b //GHASH block 8k+3
1958
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
1959
rev64 v8.16b, v8.16b //GHASH block 8k
1960
1961
rev64 v10.16b, v10.16b //GHASH block 8k+2
1962
rev32 v5.16b, v30.16b //CTR block 8k+13
1963
ldp q26, q27, [x8, #0] //load rk0, rk1
1964
1965
ldr q23, [x3, #176] //load h7l | h7h
1966
ext v23.16b, v23.16b, v23.16b, #8
1967
ldr q25, [x3, #208] //load h8l | h8h
1968
ext v25.16b, v25.16b, v25.16b, #8
1969
eor v8.16b, v8.16b, v19.16b //PRE 1
1970
rev64 v9.16b, v9.16b //GHASH block 8k+1
1971
1972
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
1973
ldr q20, [x3, #128] //load h5l | h5h
1974
ext v20.16b, v20.16b, v20.16b, #8
1975
ldr q22, [x3, #160] //load h6l | h6h
1976
ext v22.16b, v22.16b, v22.16b, #8
1977
rev64 v13.16b, v13.16b //GHASH block 8k+5
1978
1979
rev64 v12.16b, v12.16b //GHASH block 8k+4
1980
1981
rev64 v14.16b, v14.16b //GHASH block 8k+6
1982
1983
ldr q21, [x3, #144] //load h6k | h5k
1984
ldr q24, [x3, #192] //load h8k | h7k
1985
rev32 v6.16b, v30.16b //CTR block 8k+14
1986
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
1987
1988
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
1989
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
1990
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
1991
1992
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
1993
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
1994
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
1995
1996
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
1997
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
1998
aese v0.16b, v26.16b
1999
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
2000
2001
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
2002
aese v4.16b, v26.16b
2003
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
2004
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
2005
2006
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
2007
rev32 v7.16b, v30.16b //CTR block 8k+15
2008
aese v3.16b, v26.16b
2009
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
2010
2011
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
2012
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
2013
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
2014
2015
aese v2.16b, v26.16b
2016
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
2017
aese v1.16b, v26.16b
2018
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
2019
aese v5.16b, v26.16b
2020
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
2021
2022
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
2023
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
2024
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
2025
2026
aese v2.16b, v27.16b
2027
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
2028
aese v7.16b, v26.16b
2029
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
2030
aese v6.16b, v26.16b
2031
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
2032
2033
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
2034
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
2035
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
2036
2037
aese v6.16b, v27.16b
2038
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
2039
aese v4.16b, v27.16b
2040
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
2041
aese v5.16b, v27.16b
2042
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
2043
2044
ldp q28, q26, [x8, #32] //load rk2, rk3
2045
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
2046
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
2047
2048
ldr q23, [x3, #80] //load h3l | h3h
2049
ext v23.16b, v23.16b, v23.16b, #8
2050
ldr q25, [x3, #112] //load h4l | h4h
2051
ext v25.16b, v25.16b, v25.16b, #8
2052
aese v1.16b, v27.16b
2053
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
2054
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
2055
2056
aese v3.16b, v27.16b
2057
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
2058
aese v7.16b, v27.16b
2059
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
2060
aese v0.16b, v27.16b
2061
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
2062
2063
ldr q20, [x3, #32] //load h1l | h1h
2064
ext v20.16b, v20.16b, v20.16b, #8
2065
ldr q22, [x3, #64] //load h2l | h2h
2066
ext v22.16b, v22.16b, v22.16b, #8
2067
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
2068
2069
aese v0.16b, v28.16b
2070
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
2071
aese v6.16b, v28.16b
2072
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
2073
aese v2.16b, v28.16b
2074
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
2075
2076
aese v4.16b, v28.16b
2077
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
2078
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
2079
aese v7.16b, v28.16b
2080
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
2081
2082
aese v1.16b, v28.16b
2083
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
2084
aese v5.16b, v28.16b
2085
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
2086
aese v3.16b, v28.16b
2087
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
2088
2089
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
2090
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
2091
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
2092
2093
ldp q27, q28, [x8, #64] //load rk4, rk5
2094
rev64 v15.16b, v15.16b //GHASH block 8k+7
2095
aese v6.16b, v26.16b
2096
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
2097
2098
ldr q21, [x3, #48] //load h2k | h1k
2099
ldr q24, [x3, #96] //load h4k | h3k
2100
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
2101
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
2102
2103
aese v2.16b, v26.16b
2104
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
2105
aese v0.16b, v26.16b
2106
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
2107
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
2108
2109
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
2110
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
2111
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
2112
2113
aese v4.16b, v26.16b
2114
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
2115
aese v3.16b, v26.16b
2116
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
2117
aese v7.16b, v26.16b
2118
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
2119
2120
aese v1.16b, v26.16b
2121
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
2122
aese v5.16b, v26.16b
2123
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
2124
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
2125
2126
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
2127
aese v0.16b, v27.16b
2128
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
2129
aese v2.16b, v27.16b
2130
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
2131
2132
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
2133
aese v5.16b, v27.16b
2134
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
2135
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
2136
2137
aese v1.16b, v27.16b
2138
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
2139
aese v6.16b, v27.16b
2140
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
2141
aese v4.16b, v27.16b
2142
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
2143
2144
aese v7.16b, v27.16b
2145
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
2146
aese v3.16b, v27.16b
2147
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
2148
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
2149
2150
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
2151
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
2152
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
2153
2154
ldp q26, q27, [x8, #96] //load rk6, rk7
2155
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
2156
aese v6.16b, v28.16b
2157
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
2158
2159
ldr d16, [x10] //MODULO - load modulo constant
2160
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
2161
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
2162
2163
aese v0.16b, v28.16b
2164
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
2165
aese v2.16b, v28.16b
2166
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
2167
aese v4.16b, v28.16b
2168
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
2169
2170
aese v3.16b, v28.16b
2171
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
2172
aese v1.16b, v28.16b
2173
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
2174
aese v5.16b, v28.16b
2175
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
2176
2177
aese v7.16b, v28.16b
2178
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
2179
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
2180
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
2181
2182
aese v4.16b, v26.16b
2183
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
2184
aese v1.16b, v26.16b
2185
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
2186
aese v2.16b, v26.16b
2187
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
2188
2189
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
2190
aese v5.16b, v26.16b
2191
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
2192
aese v0.16b, v26.16b
2193
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
2194
2195
aese v3.16b, v26.16b
2196
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
2197
aese v6.16b, v26.16b
2198
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
2199
aese v7.16b, v26.16b
2200
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
2201
2202
aese v4.16b, v27.16b
2203
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
2204
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
2205
ldp q28, q26, [x8, #128] //load rk8, rk9
2206
2207
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
2208
aese v3.16b, v27.16b
2209
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
2210
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
2211
2212
aese v5.16b, v27.16b
2213
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
2214
aese v6.16b, v27.16b
2215
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
2216
aese v0.16b, v27.16b
2217
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
2218
2219
aese v7.16b, v27.16b
2220
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
2221
aese v1.16b, v27.16b
2222
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
2223
aese v2.16b, v27.16b
2224
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
2225
2226
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
2227
ldr q27, [x8, #160] //load rk10
2228
2229
aese v3.16b, v28.16b
2230
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
2231
aese v0.16b, v28.16b
2232
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
2233
2234
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
2235
aese v6.16b, v28.16b
2236
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
2237
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
2238
2239
aese v2.16b, v28.16b
2240
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
2241
aese v1.16b, v28.16b
2242
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
2243
aese v7.16b, v28.16b
2244
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
2245
2246
aese v6.16b, v26.16b //AES block 8k+14 - round 9
2247
aese v5.16b, v28.16b
2248
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
2249
aese v4.16b, v28.16b
2250
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
2251
2252
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
2253
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
2254
aese v2.16b, v26.16b //AES block 8k+10 - round 9
2255
2256
aese v3.16b, v26.16b //AES block 8k+11 - round 9
2257
aese v5.16b, v26.16b //AES block 8k+13 - round 9
2258
aese v0.16b, v26.16b //AES block 8k+8 - round 9
2259
2260
aese v4.16b, v26.16b //AES block 8k+12 - round 9
2261
aese v1.16b, v26.16b //AES block 8k+9 - round 9
2262
aese v7.16b, v26.16b //AES block 8k+15 - round 9
2263
2264
.L128_dec_tail: //TAIL
2265
2266
mov v29.16b, v27.16b
2267
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
2268
2269
cmp x5, #112
2270
2271
ldp q24, q25, [x3, #192] //load h8k | h7k
2272
ext v25.16b, v25.16b, v25.16b, #8
2273
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
2274
2275
ldp q20, q21, [x3, #128] //load h5l | h5h
2276
ext v20.16b, v20.16b, v20.16b, #8
2277
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
2278
2279
ldp q22, q23, [x3, #160] //load h6l | h6h
2280
ext v22.16b, v22.16b, v22.16b, #8
2281
ext v23.16b, v23.16b, v23.16b, #8
2282
2283
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
2284
b.gt .L128_dec_blocks_more_than_7
2285
2286
cmp x5, #96
2287
mov v7.16b, v6.16b
2288
movi v19.8b, #0
2289
2290
movi v17.8b, #0
2291
mov v6.16b, v5.16b
2292
mov v5.16b, v4.16b
2293
2294
mov v4.16b, v3.16b
2295
mov v3.16b, v2.16b
2296
mov v2.16b, v1.16b
2297
2298
movi v18.8b, #0
2299
sub v30.4s, v30.4s, v31.4s
2300
b.gt .L128_dec_blocks_more_than_6
2301
2302
cmp x5, #80
2303
sub v30.4s, v30.4s, v31.4s
2304
2305
mov v7.16b, v6.16b
2306
mov v6.16b, v5.16b
2307
mov v5.16b, v4.16b
2308
2309
mov v4.16b, v3.16b
2310
mov v3.16b, v1.16b
2311
b.gt .L128_dec_blocks_more_than_5
2312
2313
cmp x5, #64
2314
2315
mov v7.16b, v6.16b
2316
mov v6.16b, v5.16b
2317
mov v5.16b, v4.16b
2318
2319
mov v4.16b, v1.16b
2320
sub v30.4s, v30.4s, v31.4s
2321
b.gt .L128_dec_blocks_more_than_4
2322
2323
sub v30.4s, v30.4s, v31.4s
2324
mov v7.16b, v6.16b
2325
mov v6.16b, v5.16b
2326
2327
mov v5.16b, v1.16b
2328
cmp x5, #48
2329
b.gt .L128_dec_blocks_more_than_3
2330
2331
sub v30.4s, v30.4s, v31.4s
2332
mov v7.16b, v6.16b
2333
cmp x5, #32
2334
2335
ldr q24, [x3, #96] //load h4k | h3k
2336
mov v6.16b, v1.16b
2337
b.gt .L128_dec_blocks_more_than_2
2338
2339
cmp x5, #16
2340
2341
mov v7.16b, v1.16b
2342
sub v30.4s, v30.4s, v31.4s
2343
b.gt .L128_dec_blocks_more_than_1
2344
2345
sub v30.4s, v30.4s, v31.4s
2346
ldr q21, [x3, #48] //load h2k | h1k
2347
b .L128_dec_blocks_less_than_1
2348
.L128_dec_blocks_more_than_7: //blocks left > 7
2349
rev64 v8.16b, v9.16b //GHASH final-7 block
2350
2351
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2352
2353
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
2354
2355
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
2356
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
2357
2358
movi v16.8b, #0 //suppress further partial tag feed in
2359
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
2360
2361
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
2362
2363
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
2364
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
2365
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
2366
2367
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
2368
.L128_dec_blocks_more_than_6: //blocks left > 6
2369
2370
rev64 v8.16b, v9.16b //GHASH final-6 block
2371
2372
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2373
2374
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
2375
2376
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
2377
2378
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
2379
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
2380
movi v16.8b, #0 //suppress further partial tag feed in
2381
2382
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
2383
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
2384
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
2385
2386
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
2387
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
2388
2389
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
2390
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
2391
.L128_dec_blocks_more_than_5: //blocks left > 5
2392
2393
rev64 v8.16b, v9.16b //GHASH final-5 block
2394
2395
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
2396
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
2397
2398
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2399
2400
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
2401
2402
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
2403
2404
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
2405
2406
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
2407
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
2408
movi v16.8b, #0 //suppress further partial tag feed in
2409
2410
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
2411
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
2412
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
2413
2414
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
2415
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
2416
.L128_dec_blocks_more_than_4: //blocks left > 4
2417
2418
rev64 v8.16b, v9.16b //GHASH final-4 block
2419
2420
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2421
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
2422
2423
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
2424
movi v16.8b, #0 //suppress further partial tag feed in
2425
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
2426
2427
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
2428
2429
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
2430
2431
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
2432
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
2433
2434
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
2435
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
2436
2437
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
2438
2439
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
2440
.L128_dec_blocks_more_than_3: //blocks left > 3
2441
2442
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
2443
rev64 v8.16b, v9.16b //GHASH final-3 block
2444
2445
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2446
2447
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
2448
2449
ldr q25, [x3, #112] //load h4l | h4h
2450
ext v25.16b, v25.16b, v25.16b, #8
2451
ldr q24, [x3, #96] //load h4k | h3k
2452
2453
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
2454
2455
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
2456
2457
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
2458
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
2459
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
2460
2461
movi v16.8b, #0 //suppress further partial tag feed in
2462
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
2463
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
2464
2465
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
2466
2467
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
2468
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
2469
.L128_dec_blocks_more_than_2: //blocks left > 2
2470
2471
rev64 v8.16b, v9.16b //GHASH final-2 block
2472
2473
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
2474
2475
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2476
ldr q23, [x3, #80] //load h3l | h3h
2477
ext v23.16b, v23.16b, v23.16b, #8
2478
movi v16.8b, #0 //suppress further partial tag feed in
2479
2480
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
2481
2482
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
2483
2484
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
2485
2486
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
2487
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
2488
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
2489
2490
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
2491
2492
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
2493
2494
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
2495
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
2496
.L128_dec_blocks_more_than_1: //blocks left > 1
2497
2498
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
2499
rev64 v8.16b, v9.16b //GHASH final-1 block
2500
2501
ldr q22, [x3, #64] //load h2l | h2h
2502
ext v22.16b, v22.16b, v22.16b, #8
2503
2504
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2505
2506
movi v16.8b, #0 //suppress further partial tag feed in
2507
2508
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
2509
2510
ldr q9, [x0], #16 //AES final block - load ciphertext
2511
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
2512
2513
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
2514
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
2515
ldr q21, [x3, #48] //load h2k | h1k
2516
2517
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
2518
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
2519
2520
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
2521
2522
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
2523
2524
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
2525
2526
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
2527
.L128_dec_blocks_less_than_1: //blocks left <= 1
2528
2529
and x1, x1, #127 //bit_length %= 128
2530
2531
sub x1, x1, #128 //bit_length -= 128
2532
2533
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
2534
2535
mvn x6, xzr //temp0_x = 0xffffffffffffffff
2536
and x1, x1, #127 //bit_length %= 128
2537
2538
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
2539
cmp x1, #64
2540
mvn x7, xzr //temp1_x = 0xffffffffffffffff
2541
2542
csel x13, x7, x6, lt
2543
csel x14, x6, xzr, lt
2544
2545
mov v0.d[1], x14
2546
mov v0.d[0], x13 //ctr0b is mask for last block
2547
2548
ldr q20, [x3, #32] //load h1l | h1h
2549
ext v20.16b, v20.16b, v20.16b, #8
2550
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
2551
2552
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
2553
2554
rev64 v8.16b, v9.16b //GHASH final block
2555
2556
eor v8.16b, v8.16b, v16.16b //feed in partial tag
2557
2558
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
2559
ins v16.d[0], v8.d[1] //GHASH final block - mid
2560
2561
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
2562
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
2563
2564
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
2565
2566
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
2567
st1 { v12.16b}, [x2] //store all 16B
2568
2569
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
2570
2571
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
2572
ldr d16, [x10] //MODULO - load modulo constant
2573
2574
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
2575
2576
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
2577
2578
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
2579
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
2580
2581
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
2582
2583
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
2584
2585
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
2586
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
2587
2588
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
2589
ext v19.16b, v19.16b, v19.16b, #8
2590
rev64 v19.16b, v19.16b
2591
st1 { v19.16b }, [x3]
2592
rev32 v30.16b, v30.16b
2593
2594
str q30, [x16] //store the updated counter
2595
2596
mov x0, x9
2597
2598
ldp d10, d11, [sp, #16]
2599
ldp d12, d13, [sp, #32]
2600
ldp d14, d15, [sp, #48]
2601
ldp d8, d9, [sp], #80
2602
ret
2603
.L128_dec_ret:
2604
mov w0, #0x0
2605
ret
2606
.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2607
.globl unroll8_eor3_aes_gcm_enc_192_kernel
2608
.type unroll8_eor3_aes_gcm_enc_192_kernel,%function
2609
.align 4
2610
unroll8_eor3_aes_gcm_enc_192_kernel:
2611
AARCH64_VALID_CALL_TARGET
2612
cbz x1, .L192_enc_ret
2613
stp d8, d9, [sp, #-80]!
2614
lsr x9, x1, #3
2615
mov x16, x4
2616
mov x8, x5
2617
stp d10, d11, [sp, #16]
2618
stp d12, d13, [sp, #32]
2619
stp d14, d15, [sp, #48]
2620
mov x5, #0xc200000000000000
2621
stp x5, xzr, [sp, #64]
2622
add x10, sp, #64
2623
2624
mov x5, x9
2625
ld1 { v0.16b}, [x16] //CTR block 0
2626
2627
mov x15, #0x100000000 //set up counter increment
2628
movi v31.16b, #0x0
2629
mov v31.d[1], x15
2630
2631
rev32 v30.16b, v0.16b //set up reversed counter
2632
2633
add v30.4s, v30.4s, v31.4s //CTR block 0
2634
2635
rev32 v1.16b, v30.16b //CTR block 1
2636
add v30.4s, v30.4s, v31.4s //CTR block 1
2637
2638
rev32 v2.16b, v30.16b //CTR block 2
2639
add v30.4s, v30.4s, v31.4s //CTR block 2
2640
2641
rev32 v3.16b, v30.16b //CTR block 3
2642
add v30.4s, v30.4s, v31.4s //CTR block 3
2643
2644
rev32 v4.16b, v30.16b //CTR block 4
2645
add v30.4s, v30.4s, v31.4s //CTR block 4
2646
sub x5, x5, #1 //byte_len - 1
2647
2648
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2649
2650
rev32 v5.16b, v30.16b //CTR block 5
2651
add v30.4s, v30.4s, v31.4s //CTR block 5
2652
ldp q26, q27, [x8, #0] //load rk0, rk1
2653
2654
add x5, x5, x0
2655
2656
rev32 v6.16b, v30.16b //CTR block 6
2657
add v30.4s, v30.4s, v31.4s //CTR block 6
2658
2659
rev32 v7.16b, v30.16b //CTR block 7
2660
2661
aese v5.16b, v26.16b
2662
aesmc v5.16b, v5.16b //AES block 5 - round 0
2663
aese v4.16b, v26.16b
2664
aesmc v4.16b, v4.16b //AES block 4 - round 0
2665
aese v3.16b, v26.16b
2666
aesmc v3.16b, v3.16b //AES block 3 - round 0
2667
2668
aese v0.16b, v26.16b
2669
aesmc v0.16b, v0.16b //AES block 0 - round 0
2670
aese v1.16b, v26.16b
2671
aesmc v1.16b, v1.16b //AES block 1 - round 0
2672
aese v7.16b, v26.16b
2673
aesmc v7.16b, v7.16b //AES block 7 - round 0
2674
2675
aese v6.16b, v26.16b
2676
aesmc v6.16b, v6.16b //AES block 6 - round 0
2677
aese v2.16b, v26.16b
2678
aesmc v2.16b, v2.16b //AES block 2 - round 0
2679
ldp q28, q26, [x8, #32] //load rk2, rk3
2680
2681
aese v5.16b, v27.16b
2682
aesmc v5.16b, v5.16b //AES block 5 - round 1
2683
aese v7.16b, v27.16b
2684
aesmc v7.16b, v7.16b //AES block 7 - round 1
2685
2686
aese v2.16b, v27.16b
2687
aesmc v2.16b, v2.16b //AES block 2 - round 1
2688
aese v3.16b, v27.16b
2689
aesmc v3.16b, v3.16b //AES block 3 - round 1
2690
aese v6.16b, v27.16b
2691
aesmc v6.16b, v6.16b //AES block 6 - round 1
2692
2693
aese v5.16b, v28.16b
2694
aesmc v5.16b, v5.16b //AES block 5 - round 2
2695
aese v4.16b, v27.16b
2696
aesmc v4.16b, v4.16b //AES block 4 - round 1
2697
aese v0.16b, v27.16b
2698
aesmc v0.16b, v0.16b //AES block 0 - round 1
2699
2700
aese v1.16b, v27.16b
2701
aesmc v1.16b, v1.16b //AES block 1 - round 1
2702
aese v7.16b, v28.16b
2703
aesmc v7.16b, v7.16b //AES block 7 - round 2
2704
aese v3.16b, v28.16b
2705
aesmc v3.16b, v3.16b //AES block 3 - round 2
2706
2707
aese v2.16b, v28.16b
2708
aesmc v2.16b, v2.16b //AES block 2 - round 2
2709
aese v0.16b, v28.16b
2710
aesmc v0.16b, v0.16b //AES block 0 - round 2
2711
2712
aese v1.16b, v28.16b
2713
aesmc v1.16b, v1.16b //AES block 1 - round 2
2714
aese v4.16b, v28.16b
2715
aesmc v4.16b, v4.16b //AES block 4 - round 2
2716
aese v6.16b, v28.16b
2717
aesmc v6.16b, v6.16b //AES block 6 - round 2
2718
2719
ldp q27, q28, [x8, #64] //load rk4, rk5
2720
aese v4.16b, v26.16b
2721
aesmc v4.16b, v4.16b //AES block 4 - round 3
2722
2723
aese v7.16b, v26.16b
2724
aesmc v7.16b, v7.16b //AES block 7 - round 3
2725
aese v3.16b, v26.16b
2726
aesmc v3.16b, v3.16b //AES block 3 - round 3
2727
aese v2.16b, v26.16b
2728
aesmc v2.16b, v2.16b //AES block 2 - round 3
2729
2730
aese v1.16b, v26.16b
2731
aesmc v1.16b, v1.16b //AES block 1 - round 3
2732
2733
aese v0.16b, v26.16b
2734
aesmc v0.16b, v0.16b //AES block 0 - round 3
2735
2736
aese v6.16b, v26.16b
2737
aesmc v6.16b, v6.16b //AES block 6 - round 3
2738
2739
aese v0.16b, v27.16b
2740
aesmc v0.16b, v0.16b //AES block 0 - round 4
2741
aese v1.16b, v27.16b
2742
aesmc v1.16b, v1.16b //AES block 1 - round 4
2743
aese v5.16b, v26.16b
2744
aesmc v5.16b, v5.16b //AES block 5 - round 3
2745
2746
aese v3.16b, v27.16b
2747
aesmc v3.16b, v3.16b //AES block 3 - round 4
2748
aese v2.16b, v27.16b
2749
aesmc v2.16b, v2.16b //AES block 2 - round 4
2750
aese v4.16b, v27.16b
2751
aesmc v4.16b, v4.16b //AES block 4 - round 4
2752
2753
aese v6.16b, v27.16b
2754
aesmc v6.16b, v6.16b //AES block 6 - round 4
2755
aese v7.16b, v27.16b
2756
aesmc v7.16b, v7.16b //AES block 7 - round 4
2757
aese v5.16b, v27.16b
2758
aesmc v5.16b, v5.16b //AES block 5 - round 4
2759
2760
aese v1.16b, v28.16b
2761
aesmc v1.16b, v1.16b //AES block 1 - round 5
2762
ldp q26, q27, [x8, #96] //load rk6, rk7
2763
aese v2.16b, v28.16b
2764
aesmc v2.16b, v2.16b //AES block 2 - round 5
2765
2766
aese v4.16b, v28.16b
2767
aesmc v4.16b, v4.16b //AES block 4 - round 5
2768
aese v7.16b, v28.16b
2769
aesmc v7.16b, v7.16b //AES block 7 - round 5
2770
aese v0.16b, v28.16b
2771
aesmc v0.16b, v0.16b //AES block 0 - round 5
2772
2773
aese v5.16b, v28.16b
2774
aesmc v5.16b, v5.16b //AES block 5 - round 5
2775
aese v6.16b, v28.16b
2776
aesmc v6.16b, v6.16b //AES block 6 - round 5
2777
aese v3.16b, v28.16b
2778
aesmc v3.16b, v3.16b //AES block 3 - round 5
2779
2780
add v30.4s, v30.4s, v31.4s //CTR block 7
2781
2782
aese v5.16b, v26.16b
2783
aesmc v5.16b, v5.16b //AES block 5 - round 6
2784
aese v4.16b, v26.16b
2785
aesmc v4.16b, v4.16b //AES block 4 - round 6
2786
aese v3.16b, v26.16b
2787
aesmc v3.16b, v3.16b //AES block 3 - round 6
2788
2789
aese v2.16b, v26.16b
2790
aesmc v2.16b, v2.16b //AES block 2 - round 6
2791
aese v6.16b, v26.16b
2792
aesmc v6.16b, v6.16b //AES block 6 - round 6
2793
aese v1.16b, v26.16b
2794
aesmc v1.16b, v1.16b //AES block 1 - round 6
2795
2796
aese v0.16b, v26.16b
2797
aesmc v0.16b, v0.16b //AES block 0 - round 6
2798
aese v7.16b, v26.16b
2799
aesmc v7.16b, v7.16b //AES block 7 - round 6
2800
ldp q28, q26, [x8, #128] //load rk8, rk9
2801
2802
aese v6.16b, v27.16b
2803
aesmc v6.16b, v6.16b //AES block 6 - round 7
2804
aese v3.16b, v27.16b
2805
aesmc v3.16b, v3.16b //AES block 3 - round 7
2806
2807
aese v4.16b, v27.16b
2808
aesmc v4.16b, v4.16b //AES block 4 - round 7
2809
aese v0.16b, v27.16b
2810
aesmc v0.16b, v0.16b //AES block 0 - round 7
2811
2812
aese v7.16b, v27.16b
2813
aesmc v7.16b, v7.16b //AES block 7 - round 7
2814
aese v1.16b, v27.16b
2815
aesmc v1.16b, v1.16b //AES block 1 - round 7
2816
2817
aese v2.16b, v27.16b
2818
aesmc v2.16b, v2.16b //AES block 2 - round 7
2819
aese v5.16b, v27.16b
2820
aesmc v5.16b, v5.16b //AES block 5 - round 7
2821
2822
aese v7.16b, v28.16b
2823
aesmc v7.16b, v7.16b //AES block 7 - round 8
2824
aese v0.16b, v28.16b
2825
aesmc v0.16b, v0.16b //AES block 0 - round 8
2826
2827
aese v4.16b, v28.16b
2828
aesmc v4.16b, v4.16b //AES block 4 - round 8
2829
aese v3.16b, v28.16b
2830
aesmc v3.16b, v3.16b //AES block 3 - round 8
2831
aese v5.16b, v28.16b
2832
aesmc v5.16b, v5.16b //AES block 5 - round 8
2833
2834
aese v2.16b, v28.16b
2835
aesmc v2.16b, v2.16b //AES block 2 - round 8
2836
aese v1.16b, v28.16b
2837
aesmc v1.16b, v1.16b //AES block 1 - round 8
2838
aese v6.16b, v28.16b
2839
aesmc v6.16b, v6.16b //AES block 6 - round 8
2840
2841
add x4, x0, x1, lsr #3 //end_input_ptr
2842
cmp x0, x5 //check if we have <= 8 blocks
2843
aese v3.16b, v26.16b
2844
aesmc v3.16b, v3.16b //AES block 3 - round 9
2845
2846
ld1 { v19.16b}, [x3]
2847
ext v19.16b, v19.16b, v19.16b, #8
2848
rev64 v19.16b, v19.16b
2849
ldp q27, q28, [x8, #160] //load rk10, rk11
2850
2851
aese v6.16b, v26.16b
2852
aesmc v6.16b, v6.16b //AES block 6 - round 9
2853
aese v1.16b, v26.16b
2854
aesmc v1.16b, v1.16b //AES block 1 - round 9
2855
2856
aese v5.16b, v26.16b
2857
aesmc v5.16b, v5.16b //AES block 5 - round 9
2858
aese v2.16b, v26.16b
2859
aesmc v2.16b, v2.16b //AES block 2 - round 9
2860
2861
aese v0.16b, v26.16b
2862
aesmc v0.16b, v0.16b //AES block 0 - round 9
2863
aese v4.16b, v26.16b
2864
aesmc v4.16b, v4.16b //AES block 4 - round 9
2865
2866
aese v6.16b, v27.16b
2867
aesmc v6.16b, v6.16b //AES block 14 - round 10
2868
aese v7.16b, v26.16b
2869
aesmc v7.16b, v7.16b //AES block 7 - round 9
2870
aese v3.16b, v27.16b
2871
aesmc v3.16b, v3.16b //AES block 11 - round 10
2872
2873
aese v1.16b, v27.16b
2874
aesmc v1.16b, v1.16b //AES block 9 - round 10
2875
aese v5.16b, v27.16b
2876
aesmc v5.16b, v5.16b //AES block 13 - round 10
2877
aese v4.16b, v27.16b
2878
aesmc v4.16b, v4.16b //AES block 12 - round 10
2879
2880
aese v0.16b, v27.16b
2881
aesmc v0.16b, v0.16b //AES block 8 - round 10
2882
aese v2.16b, v27.16b
2883
aesmc v2.16b, v2.16b //AES block 10 - round 10
2884
aese v7.16b, v27.16b
2885
aesmc v7.16b, v7.16b //AES block 15 - round 10
2886
2887
aese v6.16b, v28.16b //AES block 14 - round 11
2888
aese v3.16b, v28.16b //AES block 11 - round 11
2889
2890
aese v4.16b, v28.16b //AES block 12 - round 11
2891
aese v7.16b, v28.16b //AES block 15 - round 11
2892
ldr q26, [x8, #192] //load rk12
2893
2894
aese v1.16b, v28.16b //AES block 9 - round 11
2895
aese v5.16b, v28.16b //AES block 13 - round 11
2896
2897
aese v2.16b, v28.16b //AES block 10 - round 11
2898
aese v0.16b, v28.16b //AES block 8 - round 11
2899
b.ge .L192_enc_tail //handle tail
2900
2901
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
2902
2903
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
2904
2905
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
2906
2907
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
2908
2909
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
2910
rev32 v0.16b, v30.16b //CTR block 8
2911
add v30.4s, v30.4s, v31.4s //CTR block 8
2912
2913
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
2914
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
2915
2916
rev32 v1.16b, v30.16b //CTR block 9
2917
add v30.4s, v30.4s, v31.4s //CTR block 9
2918
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
2919
2920
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
2921
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
2922
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
2923
2924
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
2925
rev32 v2.16b, v30.16b //CTR block 10
2926
add v30.4s, v30.4s, v31.4s //CTR block 10
2927
2928
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
2929
cmp x0, x5 //check if we have <= 8 blocks
2930
2931
rev32 v3.16b, v30.16b //CTR block 11
2932
add v30.4s, v30.4s, v31.4s //CTR block 11
2933
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
2934
2935
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
2936
2937
rev32 v4.16b, v30.16b //CTR block 12
2938
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
2939
add v30.4s, v30.4s, v31.4s //CTR block 12
2940
2941
b.ge .L192_enc_prepretail //do prepretail
2942
2943
.L192_enc_main_loop: //main loop start
2944
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
2945
ldp q26, q27, [x8, #0] //load rk0, rk1
2946
rev64 v10.16b, v10.16b //GHASH block 8k+2
2947
2948
rev32 v5.16b, v30.16b //CTR block 8k+13
2949
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
2950
ldr q23, [x3, #176] //load h7l | h7h
2951
ext v23.16b, v23.16b, v23.16b, #8
2952
ldr q25, [x3, #208] //load h8l | h8h
2953
ext v25.16b, v25.16b, v25.16b, #8
2954
2955
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
2956
rev64 v8.16b, v8.16b //GHASH block 8k
2957
ldr q20, [x3, #128] //load h5l | h5h
2958
ext v20.16b, v20.16b, v20.16b, #8
2959
ldr q22, [x3, #160] //load h6l | h6h
2960
ext v22.16b, v22.16b, v22.16b, #8
2961
2962
rev64 v9.16b, v9.16b //GHASH block 8k+1
2963
rev32 v6.16b, v30.16b //CTR block 8k+14
2964
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
2965
2966
eor v8.16b, v8.16b, v19.16b //PRE 1
2967
rev64 v11.16b, v11.16b //GHASH block 8k+3
2968
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
2969
2970
aese v0.16b, v26.16b
2971
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
2972
rev32 v7.16b, v30.16b //CTR block 8k+15
2973
aese v1.16b, v26.16b
2974
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
2975
2976
aese v3.16b, v26.16b
2977
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
2978
aese v5.16b, v26.16b
2979
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
2980
aese v2.16b, v26.16b
2981
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
2982
2983
aese v7.16b, v26.16b
2984
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
2985
aese v4.16b, v26.16b
2986
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
2987
aese v6.16b, v26.16b
2988
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
2989
2990
ldp q28, q26, [x8, #32] //load rk2, rk3
2991
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
2992
aese v0.16b, v27.16b
2993
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
2994
2995
aese v4.16b, v27.16b
2996
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
2997
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
2998
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
2999
3000
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
3001
aese v3.16b, v27.16b
3002
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
3003
ldr q21, [x3, #144] //load h6k | h5k
3004
ldr q24, [x3, #192] //load h8k | h7k
3005
3006
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
3007
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
3008
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
3009
3010
aese v1.16b, v27.16b
3011
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
3012
aese v2.16b, v27.16b
3013
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
3014
aese v5.16b, v27.16b
3015
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
3016
3017
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
3018
aese v6.16b, v27.16b
3019
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
3020
aese v7.16b, v27.16b
3021
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
3022
3023
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
3024
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
3025
aese v1.16b, v28.16b
3026
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
3027
3028
aese v3.16b, v28.16b
3029
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
3030
aese v4.16b, v28.16b
3031
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
3032
aese v6.16b, v28.16b
3033
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
3034
3035
aese v5.16b, v28.16b
3036
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
3037
aese v1.16b, v26.16b
3038
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
3039
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
3040
3041
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
3042
aese v7.16b, v28.16b
3043
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
3044
aese v4.16b, v26.16b
3045
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
3046
3047
aese v2.16b, v28.16b
3048
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
3049
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
3050
aese v0.16b, v28.16b
3051
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
3052
3053
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
3054
aese v3.16b, v26.16b
3055
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
3056
ldp q27, q28, [x8, #64] //load rk4, rk5
3057
3058
aese v0.16b, v26.16b
3059
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
3060
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
3061
ldr q23, [x3, #80] //load h3l | h3h
3062
ext v23.16b, v23.16b, v23.16b, #8
3063
ldr q25, [x3, #112] //load h4l | h4h
3064
ext v25.16b, v25.16b, v25.16b, #8
3065
3066
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
3067
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
3068
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
3069
3070
aese v5.16b, v26.16b
3071
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
3072
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
3073
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
3074
3075
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
3076
aese v6.16b, v26.16b
3077
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
3078
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
3079
3080
aese v1.16b, v27.16b
3081
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
3082
aese v3.16b, v27.16b
3083
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
3084
aese v7.16b, v26.16b
3085
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
3086
3087
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
3088
aese v6.16b, v27.16b
3089
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
3090
aese v2.16b, v26.16b
3091
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
3092
3093
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
3094
aese v0.16b, v27.16b
3095
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
3096
aese v4.16b, v27.16b
3097
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
3098
3099
aese v2.16b, v27.16b
3100
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
3101
aese v5.16b, v27.16b
3102
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
3103
aese v7.16b, v27.16b
3104
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
3105
3106
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
3107
aese v4.16b, v28.16b
3108
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
3109
ldr q20, [x3, #32] //load h1l | h1h
3110
ext v20.16b, v20.16b, v20.16b, #8
3111
ldr q22, [x3, #64] //load h2l | h2h
3112
ext v22.16b, v22.16b, v22.16b, #8
3113
3114
ldp q26, q27, [x8, #96] //load rk6, rk7
3115
aese v2.16b, v28.16b
3116
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
3117
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
3118
3119
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
3120
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
3121
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
3122
3123
aese v5.16b, v28.16b
3124
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
3125
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
3126
3127
aese v6.16b, v28.16b
3128
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
3129
ldr q21, [x3, #48] //load h2k | h1k
3130
ldr q24, [x3, #96] //load h4k | h3k
3131
3132
aese v1.16b, v28.16b
3133
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
3134
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
3135
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
3136
3137
aese v3.16b, v28.16b
3138
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
3139
aese v7.16b, v28.16b
3140
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
3141
aese v0.16b, v28.16b
3142
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
3143
3144
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
3145
aese v4.16b, v26.16b
3146
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
3147
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
3148
3149
aese v0.16b, v26.16b
3150
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
3151
aese v3.16b, v26.16b
3152
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
3153
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
3154
3155
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
3156
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
3157
aese v2.16b, v26.16b
3158
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
3159
3160
aese v6.16b, v26.16b
3161
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
3162
aese v5.16b, v26.16b
3163
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
3164
3165
aese v7.16b, v26.16b
3166
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
3167
aese v2.16b, v27.16b
3168
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
3169
aese v1.16b, v26.16b
3170
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
3171
3172
aese v6.16b, v27.16b
3173
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
3174
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
3175
3176
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
3177
ldp q28, q26, [x8, #128] //load rk8, rk9
3178
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
3179
3180
aese v4.16b, v27.16b
3181
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
3182
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
3183
aese v5.16b, v27.16b
3184
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
3185
3186
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
3187
aese v7.16b, v27.16b
3188
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
3189
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
3190
3191
ldr d16, [x10] //MODULO - load modulo constant
3192
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
3193
aese v0.16b, v27.16b
3194
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
3195
3196
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
3197
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
3198
aese v3.16b, v27.16b
3199
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
3200
3201
aese v5.16b, v28.16b
3202
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
3203
aese v4.16b, v28.16b
3204
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
3205
aese v0.16b, v28.16b
3206
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
3207
3208
aese v6.16b, v28.16b
3209
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
3210
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
3211
aese v1.16b, v27.16b
3212
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
3213
3214
aese v7.16b, v28.16b
3215
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
3216
aese v2.16b, v28.16b
3217
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
3218
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
3219
3220
aese v1.16b, v28.16b
3221
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
3222
aese v3.16b, v28.16b
3223
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
3224
ldp q27, q28, [x8, #160] //load rk10, rk11
3225
3226
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
3227
rev32 v20.16b, v30.16b //CTR block 8k+16
3228
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
3229
3230
aese v2.16b, v26.16b
3231
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
3232
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
3233
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
3234
3235
aese v6.16b, v26.16b
3236
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
3237
aese v3.16b, v26.16b
3238
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
3239
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
3240
3241
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
3242
rev32 v22.16b, v30.16b //CTR block 8k+17
3243
aese v0.16b, v26.16b
3244
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
3245
3246
aese v4.16b, v26.16b
3247
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
3248
aese v1.16b, v26.16b
3249
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
3250
aese v7.16b, v26.16b
3251
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
3252
3253
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
3254
aese v5.16b, v26.16b
3255
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
3256
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
3257
3258
aese v2.16b, v27.16b
3259
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
3260
aese v4.16b, v27.16b
3261
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
3262
ldr q26, [x8, #192] //load rk12
3263
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
3264
3265
aese v0.16b, v27.16b
3266
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
3267
aese v7.16b, v27.16b
3268
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
3269
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
3270
3271
aese v4.16b, v28.16b //AES block 8k+12 - round 11
3272
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
3273
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
3274
3275
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
3276
aese v2.16b, v28.16b //AES block 8k+10 - round 11
3277
aese v1.16b, v27.16b
3278
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
3279
3280
rev32 v23.16b, v30.16b //CTR block 8k+18
3281
aese v5.16b, v27.16b
3282
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
3283
3284
aese v3.16b, v27.16b
3285
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
3286
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
3287
3288
aese v6.16b, v27.16b
3289
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
3290
aese v5.16b, v28.16b //AES block 8k+13 - round 11
3291
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
3292
3293
aese v7.16b, v28.16b //AES block 8k+15 - round 11
3294
aese v0.16b, v28.16b //AES block 8k+8 - round 11
3295
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
3296
3297
aese v6.16b, v28.16b //AES block 8k+14 - round 11
3298
aese v3.16b, v28.16b //AES block 8k+11 - round 11
3299
aese v1.16b, v28.16b //AES block 8k+9 - round 11
3300
3301
rev32 v25.16b, v30.16b //CTR block 8k+19
3302
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
3303
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
3304
3305
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
3306
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
3307
mov v2.16b, v23.16b //CTR block 8k+18
3308
3309
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
3310
mov v1.16b, v22.16b //CTR block 8k+17
3311
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
3312
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
3313
3314
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
3315
mov v0.16b, v20.16b //CTR block 8k+16
3316
rev32 v4.16b, v30.16b //CTR block 8k+20
3317
3318
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
3319
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
3320
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
3321
3322
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
3323
mov v3.16b, v25.16b //CTR block 8k+19
3324
3325
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
3326
3327
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
3328
3329
cmp x0, x5 //.LOOP CONTROL
3330
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
3331
b.lt .L192_enc_main_loop
3332
3333
.L192_enc_prepretail: //PREPRETAIL
3334
rev32 v5.16b, v30.16b //CTR block 8k+13
3335
ldp q26, q27, [x8, #0] //load rk0, rk1
3336
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
3337
3338
ldr q23, [x3, #176] //load h7l | h7h
3339
ext v23.16b, v23.16b, v23.16b, #8
3340
ldr q25, [x3, #208] //load h8l | h8h
3341
ext v25.16b, v25.16b, v25.16b, #8
3342
rev64 v8.16b, v8.16b //GHASH block 8k
3343
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
3344
3345
rev32 v6.16b, v30.16b //CTR block 8k+14
3346
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
3347
ldr q21, [x3, #144] //load h6k | h5k
3348
ldr q24, [x3, #192] //load h8k | h7k
3349
3350
rev64 v11.16b, v11.16b //GHASH block 8k+3
3351
rev64 v10.16b, v10.16b //GHASH block 8k+2
3352
ldr q20, [x3, #128] //load h5l | h5h
3353
ext v20.16b, v20.16b, v20.16b, #8
3354
ldr q22, [x3, #160] //load h6l | h6h
3355
ext v22.16b, v22.16b, v22.16b, #8
3356
3357
eor v8.16b, v8.16b, v19.16b //PRE 1
3358
rev32 v7.16b, v30.16b //CTR block 8k+15
3359
rev64 v9.16b, v9.16b //GHASH block 8k+1
3360
3361
aese v5.16b, v26.16b
3362
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
3363
aese v2.16b, v26.16b
3364
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
3365
aese v3.16b, v26.16b
3366
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
3367
3368
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
3369
aese v0.16b, v26.16b
3370
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
3371
aese v6.16b, v26.16b
3372
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
3373
3374
aese v1.16b, v26.16b
3375
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
3376
aese v4.16b, v26.16b
3377
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
3378
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
3379
3380
aese v6.16b, v27.16b
3381
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
3382
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
3383
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
3384
3385
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
3386
aese v7.16b, v26.16b
3387
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
3388
ldp q28, q26, [x8, #32] //load rk2, rk3
3389
3390
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
3391
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
3392
aese v2.16b, v27.16b
3393
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
3394
3395
aese v5.16b, v27.16b
3396
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
3397
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
3398
aese v1.16b, v27.16b
3399
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
3400
3401
aese v7.16b, v27.16b
3402
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
3403
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
3404
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
3405
3406
aese v3.16b, v27.16b
3407
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
3408
aese v0.16b, v27.16b
3409
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
3410
aese v4.16b, v27.16b
3411
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
3412
3413
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
3414
aese v5.16b, v28.16b
3415
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
3416
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
3417
3418
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
3419
aese v7.16b, v28.16b
3420
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
3421
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
3422
3423
aese v5.16b, v26.16b
3424
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
3425
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
3426
aese v6.16b, v28.16b
3427
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
3428
3429
aese v0.16b, v28.16b
3430
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
3431
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
3432
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
3433
3434
aese v3.16b, v28.16b
3435
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
3436
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
3437
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
3438
3439
aese v2.16b, v28.16b
3440
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
3441
aese v1.16b, v28.16b
3442
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
3443
aese v4.16b, v28.16b
3444
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
3445
3446
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
3447
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
3448
ldp q27, q28, [x8, #64] //load rk4, rk5
3449
3450
aese v1.16b, v26.16b
3451
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
3452
aese v6.16b, v26.16b
3453
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
3454
aese v2.16b, v26.16b
3455
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
3456
3457
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
3458
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
3459
aese v7.16b, v26.16b
3460
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
3461
3462
ldr q23, [x3, #80] //load h3l | h3h
3463
ext v23.16b, v23.16b, v23.16b, #8
3464
ldr q25, [x3, #112] //load h4l | h4h
3465
ext v25.16b, v25.16b, v25.16b, #8
3466
aese v3.16b, v26.16b
3467
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
3468
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
3469
3470
ldr q20, [x3, #32] //load h1l | h1h
3471
ext v20.16b, v20.16b, v20.16b, #8
3472
ldr q22, [x3, #64] //load h2l | h2h
3473
ext v22.16b, v22.16b, v22.16b, #8
3474
aese v4.16b, v26.16b
3475
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
3476
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
3477
3478
aese v0.16b, v26.16b
3479
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
3480
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
3481
aese v6.16b, v27.16b
3482
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
3483
3484
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
3485
aese v7.16b, v27.16b
3486
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
3487
aese v5.16b, v27.16b
3488
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
3489
3490
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
3491
aese v3.16b, v27.16b
3492
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
3493
aese v0.16b, v27.16b
3494
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
3495
3496
aese v1.16b, v27.16b
3497
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
3498
aese v4.16b, v27.16b
3499
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
3500
aese v2.16b, v27.16b
3501
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
3502
3503
aese v0.16b, v28.16b
3504
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
3505
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
3506
ldr q21, [x3, #48] //load h2k | h1k
3507
ldr q24, [x3, #96] //load h4k | h3k
3508
3509
aese v1.16b, v28.16b
3510
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
3511
aese v2.16b, v28.16b
3512
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
3513
ldp q26, q27, [x8, #96] //load rk6, rk7
3514
3515
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
3516
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
3517
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
3518
3519
aese v4.16b, v28.16b
3520
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
3521
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
3522
3523
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
3524
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
3525
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
3526
3527
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
3528
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
3529
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
3530
3531
aese v5.16b, v28.16b
3532
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
3533
aese v1.16b, v26.16b
3534
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
3535
aese v7.16b, v28.16b
3536
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
3537
3538
aese v6.16b, v28.16b
3539
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
3540
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
3541
aese v3.16b, v28.16b
3542
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
3543
3544
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
3545
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
3546
3547
aese v4.16b, v26.16b
3548
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
3549
aese v5.16b, v26.16b
3550
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
3551
aese v1.16b, v27.16b
3552
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
3553
3554
aese v0.16b, v26.16b
3555
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
3556
aese v7.16b, v26.16b
3557
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
3558
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
3559
3560
aese v2.16b, v26.16b
3561
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
3562
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
3563
aese v5.16b, v27.16b
3564
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
3565
3566
aese v6.16b, v26.16b
3567
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
3568
ldr d16, [x10] //MODULO - load modulo constant
3569
aese v3.16b, v26.16b
3570
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
3571
3572
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
3573
aese v0.16b, v27.16b
3574
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
3575
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
3576
3577
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
3578
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
3579
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
3580
3581
aese v4.16b, v27.16b
3582
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
3583
aese v2.16b, v27.16b
3584
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
3585
ldp q28, q26, [x8, #128] //load rk8, rk9
3586
3587
aese v3.16b, v27.16b
3588
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
3589
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
3590
3591
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
3592
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
3593
3594
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
3595
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
3596
aese v7.16b, v27.16b
3597
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
3598
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
3599
3600
aese v5.16b, v28.16b
3601
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
3602
aese v1.16b, v28.16b
3603
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
3604
3605
aese v6.16b, v27.16b
3606
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
3607
aese v2.16b, v28.16b
3608
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
3609
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
3610
3611
aese v3.16b, v28.16b
3612
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
3613
aese v5.16b, v26.16b
3614
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
3615
aese v4.16b, v28.16b
3616
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
3617
3618
aese v0.16b, v28.16b
3619
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
3620
aese v7.16b, v28.16b
3621
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
3622
aese v6.16b, v28.16b
3623
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
3624
3625
aese v3.16b, v26.16b
3626
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
3627
ldp q27, q28, [x8, #160] //load rk10, rk11
3628
aese v4.16b, v26.16b
3629
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
3630
3631
aese v2.16b, v26.16b
3632
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
3633
aese v7.16b, v26.16b
3634
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
3635
3636
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
3637
aese v6.16b, v26.16b
3638
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
3639
aese v0.16b, v26.16b
3640
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
3641
aese v1.16b, v26.16b
3642
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
3643
3644
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
3645
ldr q26, [x8, #192] //load rk12
3646
3647
aese v7.16b, v27.16b
3648
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
3649
aese v1.16b, v27.16b
3650
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
3651
aese v2.16b, v27.16b
3652
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
3653
3654
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
3655
aese v0.16b, v27.16b
3656
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
3657
aese v3.16b, v27.16b
3658
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
3659
3660
aese v1.16b, v28.16b //AES block 8k+9 - round 11
3661
aese v7.16b, v28.16b //AES block 8k+15 - round 11
3662
3663
aese v4.16b, v27.16b
3664
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
3665
aese v3.16b, v28.16b //AES block 8k+11 - round 11
3666
3667
aese v5.16b, v27.16b
3668
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
3669
aese v6.16b, v27.16b
3670
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
3671
3672
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
3673
aese v2.16b, v28.16b //AES block 8k+10 - round 11
3674
aese v0.16b, v28.16b //AES block 8k+8 - round 11
3675
3676
aese v6.16b, v28.16b //AES block 8k+14 - round 11
3677
aese v4.16b, v28.16b //AES block 8k+12 - round 11
3678
aese v5.16b, v28.16b //AES block 8k+13 - round 11
3679
3680
.L192_enc_tail: //TAIL
3681
3682
ldp q20, q21, [x3, #128] //load h5l | h5h
3683
ext v20.16b, v20.16b, v20.16b, #8
3684
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
3685
3686
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
3687
3688
ldp q24, q25, [x3, #192] //load h8k | h7k, h8l | h8h
3689
ext v25.16b, v25.16b, v25.16b, #8
3690
3691
mov v29.16b, v26.16b
3692
3693
ldp q22, q23, [x3, #160] //load h6l | h6h, h7l | h7h
3694
ext v22.16b, v22.16b, v22.16b, #8
3695
ext v23.16b, v23.16b, v23.16b, #8
3696
cmp x5, #112
3697
3698
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
3699
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
3700
b.gt .L192_enc_blocks_more_than_7
3701
3702
cmp x5, #96
3703
mov v7.16b, v6.16b
3704
movi v17.8b, #0
3705
3706
mov v6.16b, v5.16b
3707
movi v19.8b, #0
3708
sub v30.4s, v30.4s, v31.4s
3709
3710
mov v5.16b, v4.16b
3711
mov v4.16b, v3.16b
3712
mov v3.16b, v2.16b
3713
3714
mov v2.16b, v1.16b
3715
movi v18.8b, #0
3716
b.gt .L192_enc_blocks_more_than_6
3717
3718
mov v7.16b, v6.16b
3719
cmp x5, #80
3720
3721
mov v6.16b, v5.16b
3722
mov v5.16b, v4.16b
3723
mov v4.16b, v3.16b
3724
3725
mov v3.16b, v1.16b
3726
sub v30.4s, v30.4s, v31.4s
3727
b.gt .L192_enc_blocks_more_than_5
3728
3729
cmp x5, #64
3730
sub v30.4s, v30.4s, v31.4s
3731
3732
mov v7.16b, v6.16b
3733
mov v6.16b, v5.16b
3734
mov v5.16b, v4.16b
3735
3736
mov v4.16b, v1.16b
3737
b.gt .L192_enc_blocks_more_than_4
3738
3739
mov v7.16b, v6.16b
3740
mov v6.16b, v5.16b
3741
mov v5.16b, v1.16b
3742
3743
sub v30.4s, v30.4s, v31.4s
3744
cmp x5, #48
3745
b.gt .L192_enc_blocks_more_than_3
3746
3747
mov v7.16b, v6.16b
3748
mov v6.16b, v1.16b
3749
sub v30.4s, v30.4s, v31.4s
3750
3751
ldr q24, [x3, #96] //load h4k | h3k
3752
cmp x5, #32
3753
b.gt .L192_enc_blocks_more_than_2
3754
3755
sub v30.4s, v30.4s, v31.4s
3756
3757
cmp x5, #16
3758
mov v7.16b, v1.16b
3759
b.gt .L192_enc_blocks_more_than_1
3760
3761
sub v30.4s, v30.4s, v31.4s
3762
ldr q21, [x3, #48] //load h2k | h1k
3763
b .L192_enc_blocks_less_than_1
3764
.L192_enc_blocks_more_than_7: //blocks left > 7
3765
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
3766
3767
rev64 v8.16b, v9.16b //GHASH final-7 block
3768
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
3769
3770
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3771
3772
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
3773
3774
ldr q9, [x0], #16 //AES final-6 block - load plaintext
3775
3776
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
3777
movi v16.8b, #0 //suppress further partial tag feed in
3778
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
3779
3780
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
3781
3782
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
3783
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
3784
.L192_enc_blocks_more_than_6: //blocks left > 6
3785
3786
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
3787
3788
rev64 v8.16b, v9.16b //GHASH final-6 block
3789
3790
ldr q9, [x0], #16 //AES final-5 block - load plaintext
3791
3792
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3793
3794
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
3795
3796
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
3797
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
3798
3799
movi v16.8b, #0 //suppress further partial tag feed in
3800
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
3801
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
3802
3803
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
3804
3805
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
3806
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
3807
3808
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
3809
.L192_enc_blocks_more_than_5: //blocks left > 5
3810
3811
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
3812
3813
rev64 v8.16b, v9.16b //GHASH final-5 block
3814
3815
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3816
3817
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
3818
3819
ldr q9, [x0], #16 //AES final-4 block - load plaintext
3820
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
3821
3822
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
3823
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
3824
3825
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
3826
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
3827
3828
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
3829
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
3830
3831
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
3832
movi v16.8b, #0 //suppress further partial tag feed in
3833
3834
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
3835
.L192_enc_blocks_more_than_4: //blocks left > 4
3836
3837
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
3838
3839
rev64 v8.16b, v9.16b //GHASH final-4 block
3840
3841
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3842
3843
ldr q9, [x0], #16 //AES final-3 block - load plaintext
3844
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
3845
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
3846
3847
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
3848
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
3849
3850
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
3851
3852
movi v16.8b, #0 //suppress further partial tag feed in
3853
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
3854
3855
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
3856
3857
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
3858
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
3859
.L192_enc_blocks_more_than_3: //blocks left > 3
3860
3861
ldr q24, [x3, #96] //load h4k | h3k
3862
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
3863
3864
rev64 v8.16b, v9.16b //GHASH final-3 block
3865
3866
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3867
movi v16.8b, #0 //suppress further partial tag feed in
3868
3869
ldr q9, [x0], #16 //AES final-2 block - load plaintext
3870
ldr q25, [x3, #112] //load h4l | h4h
3871
ext v25.16b, v25.16b, v25.16b, #8
3872
3873
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
3874
3875
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
3876
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
3877
3878
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
3879
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
3880
3881
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
3882
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
3883
3884
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
3885
3886
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
3887
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
3888
.L192_enc_blocks_more_than_2: //blocks left > 2
3889
3890
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
3891
3892
rev64 v8.16b, v9.16b //GHASH final-2 block
3893
ldr q23, [x3, #80] //load h3l | h3h
3894
ext v23.16b, v23.16b, v23.16b, #8
3895
3896
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3897
3898
ldr q9, [x0], #16 //AES final-1 block - load plaintext
3899
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
3900
3901
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
3902
3903
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
3904
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
3905
movi v16.8b, #0 //suppress further partial tag feed in
3906
3907
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
3908
3909
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
3910
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
3911
3912
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
3913
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
3914
.L192_enc_blocks_more_than_1: //blocks left > 1
3915
3916
ldr q22, [x3, #64] //load h2l | h2h
3917
ext v22.16b, v22.16b, v22.16b, #8
3918
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
3919
3920
rev64 v8.16b, v9.16b //GHASH final-1 block
3921
3922
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3923
3924
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
3925
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
3926
3927
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
3928
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
3929
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
3930
3931
ldr q9, [x0], #16 //AES final block - load plaintext
3932
ldr q21, [x3, #48] //load h2k | h1k
3933
3934
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
3935
3936
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
3937
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
3938
3939
movi v16.8b, #0 //suppress further partial tag feed in
3940
3941
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
3942
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
3943
.L192_enc_blocks_less_than_1: //blocks left <= 1
3944
3945
mvn x6, xzr //temp0_x = 0xffffffffffffffff
3946
and x1, x1, #127 //bit_length %= 128
3947
3948
sub x1, x1, #128 //bit_length -= 128
3949
3950
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
3951
3952
and x1, x1, #127 //bit_length %= 128
3953
3954
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
3955
cmp x1, #64
3956
mvn x7, xzr //temp1_x = 0xffffffffffffffff
3957
3958
csel x13, x7, x6, lt
3959
csel x14, x6, xzr, lt
3960
3961
mov v0.d[1], x14
3962
ldr q20, [x3, #32] //load h1l | h1h
3963
ext v20.16b, v20.16b, v20.16b, #8
3964
3965
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
3966
mov v0.d[0], x13 //ctr0b is mask for last block
3967
3968
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
3969
3970
rev64 v8.16b, v9.16b //GHASH final block
3971
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
3972
3973
st1 { v9.16b}, [x2] //store all 16B
3974
3975
eor v8.16b, v8.16b, v16.16b //feed in partial tag
3976
3977
ins v16.d[0], v8.d[1] //GHASH final block - mid
3978
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
3979
3980
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
3981
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
3982
3983
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
3984
3985
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
3986
3987
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
3988
ldr d16, [x10] //MODULO - load modulo constant
3989
3990
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
3991
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
3992
3993
rev32 v30.16b, v30.16b
3994
3995
str q30, [x16] //store the updated counter
3996
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
3997
3998
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
3999
4000
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
4001
4002
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
4003
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
4004
4005
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
4006
ext v19.16b, v19.16b, v19.16b, #8
4007
rev64 v19.16b, v19.16b
4008
st1 { v19.16b }, [x3]
4009
4010
mov x0, x9 //return sizes
4011
4012
ldp d10, d11, [sp, #16]
4013
ldp d12, d13, [sp, #32]
4014
ldp d14, d15, [sp, #48]
4015
ldp d8, d9, [sp], #80
4016
ret
4017
4018
.L192_enc_ret:
4019
mov w0, #0x0
4020
ret
4021
.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
4022
.globl unroll8_eor3_aes_gcm_dec_192_kernel
4023
.type unroll8_eor3_aes_gcm_dec_192_kernel,%function
4024
.align 4
4025
unroll8_eor3_aes_gcm_dec_192_kernel:
4026
AARCH64_VALID_CALL_TARGET
4027
cbz x1, .L192_dec_ret
4028
stp d8, d9, [sp, #-80]!
4029
lsr x9, x1, #3
4030
mov x16, x4
4031
mov x8, x5
4032
stp d10, d11, [sp, #16]
4033
stp d12, d13, [sp, #32]
4034
stp d14, d15, [sp, #48]
4035
mov x5, #0xc200000000000000
4036
stp x5, xzr, [sp, #64]
4037
add x10, sp, #64
4038
4039
mov x5, x9
4040
ld1 { v0.16b}, [x16] //CTR block 0
4041
ld1 { v19.16b}, [x3]
4042
4043
mov x15, #0x100000000 //set up counter increment
4044
movi v31.16b, #0x0
4045
mov v31.d[1], x15
4046
4047
rev32 v30.16b, v0.16b //set up reversed counter
4048
4049
add v30.4s, v30.4s, v31.4s //CTR block 0
4050
4051
rev32 v1.16b, v30.16b //CTR block 1
4052
add v30.4s, v30.4s, v31.4s //CTR block 1
4053
4054
rev32 v2.16b, v30.16b //CTR block 2
4055
add v30.4s, v30.4s, v31.4s //CTR block 2
4056
4057
rev32 v3.16b, v30.16b //CTR block 3
4058
add v30.4s, v30.4s, v31.4s //CTR block 3
4059
4060
rev32 v4.16b, v30.16b //CTR block 4
4061
add v30.4s, v30.4s, v31.4s //CTR block 4
4062
4063
rev32 v5.16b, v30.16b //CTR block 5
4064
add v30.4s, v30.4s, v31.4s //CTR block 5
4065
ldp q26, q27, [x8, #0] //load rk0, rk1
4066
4067
rev32 v6.16b, v30.16b //CTR block 6
4068
add v30.4s, v30.4s, v31.4s //CTR block 6
4069
4070
rev32 v7.16b, v30.16b //CTR block 7
4071
4072
aese v3.16b, v26.16b
4073
aesmc v3.16b, v3.16b //AES block 3 - round 0
4074
aese v6.16b, v26.16b
4075
aesmc v6.16b, v6.16b //AES block 6 - round 0
4076
aese v5.16b, v26.16b
4077
aesmc v5.16b, v5.16b //AES block 5 - round 0
4078
4079
aese v0.16b, v26.16b
4080
aesmc v0.16b, v0.16b //AES block 0 - round 0
4081
aese v1.16b, v26.16b
4082
aesmc v1.16b, v1.16b //AES block 1 - round 0
4083
aese v7.16b, v26.16b
4084
aesmc v7.16b, v7.16b //AES block 7 - round 0
4085
4086
aese v2.16b, v26.16b
4087
aesmc v2.16b, v2.16b //AES block 2 - round 0
4088
aese v4.16b, v26.16b
4089
aesmc v4.16b, v4.16b //AES block 4 - round 0
4090
ldp q28, q26, [x8, #32] //load rk2, rk3
4091
4092
aese v1.16b, v27.16b
4093
aesmc v1.16b, v1.16b //AES block 1 - round 1
4094
4095
aese v2.16b, v27.16b
4096
aesmc v2.16b, v2.16b //AES block 2 - round 1
4097
4098
aese v0.16b, v27.16b
4099
aesmc v0.16b, v0.16b //AES block 0 - round 1
4100
aese v3.16b, v27.16b
4101
aesmc v3.16b, v3.16b //AES block 3 - round 1
4102
aese v7.16b, v27.16b
4103
aesmc v7.16b, v7.16b //AES block 7 - round 1
4104
4105
aese v5.16b, v27.16b
4106
aesmc v5.16b, v5.16b //AES block 5 - round 1
4107
aese v6.16b, v27.16b
4108
aesmc v6.16b, v6.16b //AES block 6 - round 1
4109
4110
aese v7.16b, v28.16b
4111
aesmc v7.16b, v7.16b //AES block 7 - round 2
4112
aese v0.16b, v28.16b
4113
aesmc v0.16b, v0.16b //AES block 0 - round 2
4114
aese v4.16b, v27.16b
4115
aesmc v4.16b, v4.16b //AES block 4 - round 1
4116
4117
aese v5.16b, v28.16b
4118
aesmc v5.16b, v5.16b //AES block 5 - round 2
4119
aese v1.16b, v28.16b
4120
aesmc v1.16b, v1.16b //AES block 1 - round 2
4121
aese v2.16b, v28.16b
4122
aesmc v2.16b, v2.16b //AES block 2 - round 2
4123
4124
aese v3.16b, v28.16b
4125
aesmc v3.16b, v3.16b //AES block 3 - round 2
4126
aese v4.16b, v28.16b
4127
aesmc v4.16b, v4.16b //AES block 4 - round 2
4128
aese v6.16b, v28.16b
4129
aesmc v6.16b, v6.16b //AES block 6 - round 2
4130
4131
aese v7.16b, v26.16b
4132
aesmc v7.16b, v7.16b //AES block 7 - round 3
4133
4134
ldp q27, q28, [x8, #64] //load rk4, rk5
4135
aese v2.16b, v26.16b
4136
aesmc v2.16b, v2.16b //AES block 2 - round 3
4137
aese v5.16b, v26.16b
4138
aesmc v5.16b, v5.16b //AES block 5 - round 3
4139
4140
aese v0.16b, v26.16b
4141
aesmc v0.16b, v0.16b //AES block 0 - round 3
4142
aese v3.16b, v26.16b
4143
aesmc v3.16b, v3.16b //AES block 3 - round 3
4144
4145
aese v4.16b, v26.16b
4146
aesmc v4.16b, v4.16b //AES block 4 - round 3
4147
aese v1.16b, v26.16b
4148
aesmc v1.16b, v1.16b //AES block 1 - round 3
4149
aese v6.16b, v26.16b
4150
aesmc v6.16b, v6.16b //AES block 6 - round 3
4151
4152
aese v3.16b, v27.16b
4153
aesmc v3.16b, v3.16b //AES block 3 - round 4
4154
aese v2.16b, v27.16b
4155
aesmc v2.16b, v2.16b //AES block 2 - round 4
4156
aese v5.16b, v27.16b
4157
aesmc v5.16b, v5.16b //AES block 5 - round 4
4158
4159
aese v1.16b, v27.16b
4160
aesmc v1.16b, v1.16b //AES block 1 - round 4
4161
aese v7.16b, v27.16b
4162
aesmc v7.16b, v7.16b //AES block 7 - round 4
4163
aese v6.16b, v27.16b
4164
aesmc v6.16b, v6.16b //AES block 6 - round 4
4165
4166
aese v0.16b, v27.16b
4167
aesmc v0.16b, v0.16b //AES block 0 - round 4
4168
aese v5.16b, v28.16b
4169
aesmc v5.16b, v5.16b //AES block 5 - round 5
4170
aese v4.16b, v27.16b
4171
aesmc v4.16b, v4.16b //AES block 4 - round 4
4172
4173
aese v6.16b, v28.16b
4174
aesmc v6.16b, v6.16b //AES block 6 - round 5
4175
ldp q26, q27, [x8, #96] //load rk6, rk7
4176
4177
aese v0.16b, v28.16b
4178
aesmc v0.16b, v0.16b //AES block 0 - round 5
4179
aese v4.16b, v28.16b
4180
aesmc v4.16b, v4.16b //AES block 4 - round 5
4181
aese v1.16b, v28.16b
4182
aesmc v1.16b, v1.16b //AES block 1 - round 5
4183
4184
aese v3.16b, v28.16b
4185
aesmc v3.16b, v3.16b //AES block 3 - round 5
4186
aese v2.16b, v28.16b
4187
aesmc v2.16b, v2.16b //AES block 2 - round 5
4188
aese v7.16b, v28.16b
4189
aesmc v7.16b, v7.16b //AES block 7 - round 5
4190
4191
sub x5, x5, #1 //byte_len - 1
4192
4193
aese v4.16b, v26.16b
4194
aesmc v4.16b, v4.16b //AES block 4 - round 6
4195
aese v5.16b, v26.16b
4196
aesmc v5.16b, v5.16b //AES block 5 - round 6
4197
aese v1.16b, v26.16b
4198
aesmc v1.16b, v1.16b //AES block 1 - round 6
4199
4200
aese v0.16b, v26.16b
4201
aesmc v0.16b, v0.16b //AES block 0 - round 6
4202
aese v3.16b, v26.16b
4203
aesmc v3.16b, v3.16b //AES block 3 - round 6
4204
aese v6.16b, v26.16b
4205
aesmc v6.16b, v6.16b //AES block 6 - round 6
4206
4207
aese v7.16b, v26.16b
4208
aesmc v7.16b, v7.16b //AES block 7 - round 6
4209
aese v2.16b, v26.16b
4210
aesmc v2.16b, v2.16b //AES block 2 - round 6
4211
ldp q28, q26, [x8, #128] //load rk8, rk9
4212
4213
add v30.4s, v30.4s, v31.4s //CTR block 7
4214
4215
aese v3.16b, v27.16b
4216
aesmc v3.16b, v3.16b //AES block 3 - round 7
4217
aese v7.16b, v27.16b
4218
aesmc v7.16b, v7.16b //AES block 7 - round 7
4219
4220
aese v2.16b, v27.16b
4221
aesmc v2.16b, v2.16b //AES block 2 - round 7
4222
aese v1.16b, v27.16b
4223
aesmc v1.16b, v1.16b //AES block 1 - round 7
4224
aese v4.16b, v27.16b
4225
aesmc v4.16b, v4.16b //AES block 4 - round 7
4226
4227
aese v6.16b, v27.16b
4228
aesmc v6.16b, v6.16b //AES block 6 - round 7
4229
aese v0.16b, v27.16b
4230
aesmc v0.16b, v0.16b //AES block 0 - round 7
4231
aese v5.16b, v27.16b
4232
aesmc v5.16b, v5.16b //AES block 5 - round 7
4233
4234
aese v1.16b, v28.16b
4235
aesmc v1.16b, v1.16b //AES block 1 - round 8
4236
aese v2.16b, v28.16b
4237
aesmc v2.16b, v2.16b //AES block 2 - round 8
4238
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4239
4240
aese v7.16b, v28.16b
4241
aesmc v7.16b, v7.16b //AES block 7 - round 8
4242
aese v6.16b, v28.16b
4243
aesmc v6.16b, v6.16b //AES block 6 - round 8
4244
aese v5.16b, v28.16b
4245
aesmc v5.16b, v5.16b //AES block 5 - round 8
4246
4247
aese v4.16b, v28.16b
4248
aesmc v4.16b, v4.16b //AES block 4 - round 8
4249
aese v3.16b, v28.16b
4250
aesmc v3.16b, v3.16b //AES block 3 - round 8
4251
aese v0.16b, v28.16b
4252
aesmc v0.16b, v0.16b //AES block 0 - round 8
4253
4254
add x4, x0, x1, lsr #3 //end_input_ptr
4255
aese v6.16b, v26.16b
4256
aesmc v6.16b, v6.16b //AES block 6 - round 9
4257
4258
ld1 { v19.16b}, [x3]
4259
ext v19.16b, v19.16b, v19.16b, #8
4260
rev64 v19.16b, v19.16b
4261
4262
ldp q27, q28, [x8, #160] //load rk10, rk11
4263
4264
aese v0.16b, v26.16b
4265
aesmc v0.16b, v0.16b //AES block 0 - round 9
4266
add x5, x5, x0
4267
4268
aese v1.16b, v26.16b
4269
aesmc v1.16b, v1.16b //AES block 1 - round 9
4270
aese v7.16b, v26.16b
4271
aesmc v7.16b, v7.16b //AES block 7 - round 9
4272
aese v4.16b, v26.16b
4273
aesmc v4.16b, v4.16b //AES block 4 - round 9
4274
4275
cmp x0, x5 //check if we have <= 8 blocks
4276
aese v3.16b, v26.16b
4277
aesmc v3.16b, v3.16b //AES block 3 - round 9
4278
4279
aese v5.16b, v26.16b
4280
aesmc v5.16b, v5.16b //AES block 5 - round 9
4281
aese v2.16b, v26.16b
4282
aesmc v2.16b, v2.16b //AES block 2 - round 9
4283
4284
aese v3.16b, v27.16b
4285
aesmc v3.16b, v3.16b //AES block 3 - round 10
4286
aese v1.16b, v27.16b
4287
aesmc v1.16b, v1.16b //AES block 1 - round 10
4288
aese v7.16b, v27.16b
4289
aesmc v7.16b, v7.16b //AES block 7 - round 10
4290
4291
aese v4.16b, v27.16b
4292
aesmc v4.16b, v4.16b //AES block 4 - round 10
4293
aese v0.16b, v27.16b
4294
aesmc v0.16b, v0.16b //AES block 0 - round 10
4295
aese v2.16b, v27.16b
4296
aesmc v2.16b, v2.16b //AES block 2 - round 10
4297
4298
aese v6.16b, v27.16b
4299
aesmc v6.16b, v6.16b //AES block 6 - round 10
4300
aese v5.16b, v27.16b
4301
aesmc v5.16b, v5.16b //AES block 5 - round 10
4302
ldr q26, [x8, #192] //load rk12
4303
4304
aese v0.16b, v28.16b //AES block 0 - round 11
4305
aese v1.16b, v28.16b //AES block 1 - round 11
4306
aese v4.16b, v28.16b //AES block 4 - round 11
4307
4308
aese v6.16b, v28.16b //AES block 6 - round 11
4309
aese v5.16b, v28.16b //AES block 5 - round 11
4310
aese v7.16b, v28.16b //AES block 7 - round 11
4311
4312
aese v2.16b, v28.16b //AES block 2 - round 11
4313
aese v3.16b, v28.16b //AES block 3 - round 11
4314
b.ge .L192_dec_tail //handle tail
4315
4316
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
4317
4318
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
4319
4320
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
4321
4322
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
4323
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
4324
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
4325
4326
rev32 v0.16b, v30.16b //CTR block 8
4327
add v30.4s, v30.4s, v31.4s //CTR block 8
4328
4329
rev32 v1.16b, v30.16b //CTR block 9
4330
add v30.4s, v30.4s, v31.4s //CTR block 9
4331
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
4332
4333
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
4334
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
4335
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
4336
4337
rev32 v2.16b, v30.16b //CTR block 10
4338
add v30.4s, v30.4s, v31.4s //CTR block 10
4339
4340
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
4341
4342
rev32 v3.16b, v30.16b //CTR block 11
4343
add v30.4s, v30.4s, v31.4s //CTR block 11
4344
4345
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
4346
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
4347
cmp x0, x5 //check if we have <= 8 blocks
4348
4349
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
4350
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
4351
rev32 v4.16b, v30.16b //CTR block 12
4352
4353
add v30.4s, v30.4s, v31.4s //CTR block 12
4354
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
4355
b.ge .L192_dec_prepretail //do prepretail
4356
4357
.L192_dec_main_loop: //main loop start
// One pass handles eight 16-byte blocks:
//  - counter blocks v0-v7 (v5-v7 are formed at the top of the pass) go through
//    AES rounds 0-11 with rk0..rk11 loaded from x8, and are then combined by
//    eor3 with rk12 (v26) and the ciphertext loaded from x0 during this pass;
//    the results are stored through x2.
//  - the ciphertext left in v8-v15 by the previous load is byte-reversed and
//    folded into the GHASH accumulators v17 (high), v18 (mid), v19 (low) using
//    the hash-key entries loaded from x3, then reduced with the constant
//    loaded from [x10].
//  - v30 is the running counter; it is advanced by v31 for every block.
// The pass repeats while x0 < x5 (signed compare at the bottom).
4358
rev64 v9.16b, v9.16b //GHASH block 8k+1
4359
ldp q26, q27, [x8, #0] //load rk0, rk1
4360
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
4361
4362
rev64 v8.16b, v8.16b //GHASH block 8k
4363
rev32 v5.16b, v30.16b //CTR block 8k+13
4364
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
4365
4366
ldr q23, [x3, #176] //load h7l | h7h
4367
ext v23.16b, v23.16b, v23.16b, #8
4368
ldr q25, [x3, #208] //load h8l | h8h
4369
ext v25.16b, v25.16b, v25.16b, #8
4370
rev64 v12.16b, v12.16b //GHASH block 8k+4
4371
rev64 v11.16b, v11.16b //GHASH block 8k+3
4372
4373
eor v8.16b, v8.16b, v19.16b //PRE 1
4374
rev32 v6.16b, v30.16b //CTR block 8k+14
4375
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
4376
4377
rev64 v13.16b, v13.16b //GHASH block 8k+5
4378
4379
rev32 v7.16b, v30.16b //CTR block 8k+15
4380
aese v1.16b, v26.16b
4381
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
4382
aese v6.16b, v26.16b
4383
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
4384
4385
aese v5.16b, v26.16b
4386
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
4387
aese v4.16b, v26.16b
4388
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
4389
aese v0.16b, v26.16b
4390
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
4391
4392
aese v7.16b, v26.16b
4393
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
4394
aese v2.16b, v26.16b
4395
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
4396
aese v3.16b, v26.16b
4397
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
4398
4399
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
4400
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
4401
ldp q28, q26, [x8, #32] //load rk2, rk3
4402
4403
aese v6.16b, v27.16b
4404
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
4405
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
4406
ldr q20, [x3, #128] //load h5l | h5h
4407
ext v20.16b, v20.16b, v20.16b, #8
4408
ldr q22, [x3, #160] //load h6l | h6h
4409
ext v22.16b, v22.16b, v22.16b, #8
4410
4411
aese v0.16b, v27.16b
4412
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
4413
aese v3.16b, v27.16b
4414
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
4415
aese v7.16b, v27.16b
4416
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
4417
4418
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
4419
aese v2.16b, v27.16b
4420
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
4421
aese v4.16b, v27.16b
4422
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
4423
4424
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
4425
rev64 v10.16b, v10.16b //GHASH block 8k+2
4426
aese v1.16b, v27.16b
4427
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
4428
4429
aese v5.16b, v27.16b
4430
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
4431
ldr q21, [x3, #144] //load h6k | h5k
4432
ldr q24, [x3, #192] //load h8k | h7k
4433
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
4434
4435
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
4436
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
4437
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
4438
4439
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
4440
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
4441
aese v6.16b, v28.16b
4442
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
4443
4444
aese v2.16b, v28.16b
4445
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
4446
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
4447
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
4448
4449
aese v1.16b, v28.16b
4450
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
4451
aese v6.16b, v26.16b
4452
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
4453
aese v4.16b, v28.16b
4454
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
4455
4456
aese v0.16b, v28.16b
4457
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
4458
aese v7.16b, v28.16b
4459
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
4460
aese v3.16b, v28.16b
4461
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
4462
4463
ldr q23, [x3, #80] //load h3l | h3h
4464
ext v23.16b, v23.16b, v23.16b, #8
4465
ldr q25, [x3, #112] //load h4l | h4h
4466
ext v25.16b, v25.16b, v25.16b, #8
4467
aese v5.16b, v28.16b
4468
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
4469
aese v2.16b, v26.16b
4470
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
4471
4472
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
4473
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
4474
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
4475
4476
aese v3.16b, v26.16b
4477
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
4478
aese v4.16b, v26.16b
4479
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
4480
4481
aese v0.16b, v26.16b
4482
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
4483
aese v7.16b, v26.16b
4484
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
4485
ldp q27, q28, [x8, #64] //load rk4, rk5
4486
4487
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
4488
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
4489
aese v1.16b, v26.16b
4490
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
4491
4492
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
4493
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
4494
4495
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
4496
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
4497
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
4498
4499
aese v5.16b, v26.16b
4500
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
4501
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
4502
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
4503
4504
aese v4.16b, v27.16b
4505
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
4506
aese v6.16b, v27.16b
4507
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
4508
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
4509
4510
aese v5.16b, v27.16b
4511
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
4512
aese v1.16b, v27.16b
4513
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
4514
aese v3.16b, v27.16b
4515
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
4516
4517
aese v2.16b, v27.16b
4518
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
4519
aese v0.16b, v27.16b
4520
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
4521
aese v7.16b, v27.16b
4522
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
4523
4524
ldr q20, [x3, #32] //load h1l | h1h
4525
ext v20.16b, v20.16b, v20.16b, #8
4526
ldr q22, [x3, #64] //load h2l | h2h
4527
ext v22.16b, v22.16b, v22.16b, #8
4528
aese v3.16b, v28.16b
4529
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
4530
aese v5.16b, v28.16b
4531
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
4532
4533
ldp q26, q27, [x8, #96] //load rk6, rk7
4534
aese v7.16b, v28.16b
4535
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
4536
rev64 v15.16b, v15.16b //GHASH block 8k+7
4537
4538
aese v4.16b, v28.16b
4539
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
4540
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
4541
aese v1.16b, v28.16b
4542
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
4543
4544
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
4545
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
4546
aese v2.16b, v28.16b
4547
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
4548
4549
aese v6.16b, v28.16b
4550
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
4551
aese v0.16b, v28.16b
4552
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
4553
rev64 v14.16b, v14.16b //GHASH block 8k+6
4554
4555
ldr q21, [x3, #48] //load h2k | h1k
4556
ldr q24, [x3, #96] //load h4k | h3k
4557
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
4558
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
4559
4560
aese v0.16b, v26.16b
4561
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
4562
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
4563
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
4564
4565
aese v7.16b, v26.16b
4566
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
4567
aese v2.16b, v26.16b
4568
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
4569
aese v6.16b, v26.16b
4570
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
4571
4572
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
4573
aese v3.16b, v26.16b
4574
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
4575
aese v1.16b, v26.16b
4576
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
4577
4578
aese v2.16b, v27.16b
4579
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
4580
aese v6.16b, v27.16b
4581
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
4582
aese v5.16b, v26.16b
4583
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
4584
4585
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
4586
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
4587
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
4588
4589
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
4590
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
4591
aese v4.16b, v26.16b
4592
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
4593
4594
aese v5.16b, v27.16b
4595
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
4596
ldp q28, q26, [x8, #128] //load rk8, rk9
4597
aese v3.16b, v27.16b
4598
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
4599
4600
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
4601
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
4602
aese v1.16b, v27.16b
4603
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
4604
4605
aese v4.16b, v27.16b
4606
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
4607
aese v0.16b, v27.16b
4608
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
4609
aese v7.16b, v27.16b
4610
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
4611
4612
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
4613
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
4614
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
4615
4616
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
4617
ldr d16, [x10] //MODULO - load modulo constant
4618
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
4619
4620
aese v2.16b, v28.16b
4621
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
4622
aese v5.16b, v28.16b
4623
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
4624
aese v7.16b, v28.16b
4625
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
4626
4627
aese v0.16b, v28.16b
4628
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
4629
aese v3.16b, v28.16b
4630
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
4631
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
4632
4633
aese v4.16b, v28.16b
4634
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
4635
aese v1.16b, v28.16b
4636
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
4637
aese v6.16b, v28.16b
4638
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
4639
4640
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
4641
// From here on v20, v22, v23 and v25 are overwritten with the counter blocks
// for the next pass (8k+16..8k+19); their GHASH products have already been
// consumed. They are copied into v0-v3 once this pass's results are formed.
rev32 v20.16b, v30.16b //CTR block 8k+16
4642
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
4643
4644
aese v5.16b, v26.16b
4645
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
4646
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
4647
aese v1.16b, v26.16b
4648
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
4649
4650
aese v3.16b, v26.16b
4651
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
4652
aese v7.16b, v26.16b
4653
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
4654
ldp q27, q28, [x8, #160] //load rk10, rk11
4655
4656
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
4657
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
4658
4659
aese v2.16b, v26.16b
4660
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
4661
aese v0.16b, v26.16b
4662
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
4663
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
4664
4665
rev32 v22.16b, v30.16b //CTR block 8k+17
4666
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
4667
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
4668
4669
aese v6.16b, v26.16b
4670
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
4671
aese v4.16b, v26.16b
4672
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
4673
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
4674
4675
aese v3.16b, v27.16b
4676
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
4677
aese v7.16b, v27.16b
4678
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
4679
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
4680
4681
rev32 v23.16b, v30.16b //CTR block 8k+18
4682
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
4683
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
4684
4685
aese v0.16b, v27.16b
4686
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
4687
aese v1.16b, v27.16b
4688
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
4689
ldr q26, [x8, #192] //load rk12
4690
4691
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
4692
aese v4.16b, v27.16b
4693
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
4694
aese v6.16b, v27.16b
4695
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
4696
4697
aese v0.16b, v28.16b //AES block 8k+8 - round 11
4698
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
4699
aese v1.16b, v28.16b //AES block 8k+9 - round 11
4700
4701
aese v2.16b, v27.16b
4702
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
4703
aese v6.16b, v28.16b //AES block 8k+14 - round 11
4704
aese v3.16b, v28.16b //AES block 8k+11 - round 11
4705
4706
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
4707
rev32 v25.16b, v30.16b //CTR block 8k+19
4708
aese v5.16b, v27.16b
4709
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
4710
4711
aese v4.16b, v28.16b //AES block 8k+12 - round 11
4712
aese v2.16b, v28.16b //AES block 8k+10 - round 11
4713
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
4714
4715
aese v7.16b, v28.16b //AES block 8k+15 - round 11
4716
aese v5.16b, v28.16b //AES block 8k+13 - round 11
4717
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
4718
4719
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
4720
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
4721
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
4722
4723
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
4724
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
4725
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
4726
4727
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
4728
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
4729
mov v3.16b, v25.16b //CTR block 8k+19
4730
4731
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
4732
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
4733
cmp x0, x5 //LOOP CONTROL - flags consumed by the b.lt at the bottom (no flag-setting instruction in between)
4734
4735
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
4736
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
4737
mov v0.16b, v20.16b //CTR block 8k+16
4738
4739
mov v1.16b, v22.16b //CTR block 8k+17
4740
mov v2.16b, v23.16b //CTR block 8k+18
4741
4742
rev32 v4.16b, v30.16b //CTR block 8k+20
4743
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
4744
b.lt .L192_dec_main_loop
4745
4746
.L192_dec_prepretail: //PREPRETAIL
// Performs the same AES rounds 0-11 on v0-v7 and the same GHASH accumulation
// and reduction of v8-v15 as one main-loop pass, but loads nothing from x0 and
// stores nothing through x2. On exit v0-v7 hold the state after round 11,
// v26 holds rk12, v19 holds the reduced GHASH value, and execution falls
// through into .L192_dec_tail.
4747
ldp q26, q27, [x8, #0] //load rk0, rk1
4748
rev32 v5.16b, v30.16b //CTR block 8k+13
4749
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
4750
4751
ldr q23, [x3, #176] //load h7l | h7h
4752
ext v23.16b, v23.16b, v23.16b, #8
4753
ldr q25, [x3, #208] //load h8l | h8h
4754
ext v25.16b, v25.16b, v25.16b, #8
4755
rev64 v8.16b, v8.16b //GHASH block 8k
4756
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
4757
4758
rev64 v11.16b, v11.16b //GHASH block 8k+3
4759
rev32 v6.16b, v30.16b //CTR block 8k+14
4760
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
4761
4762
eor v8.16b, v8.16b, v19.16b //PRE 1
4763
rev64 v10.16b, v10.16b //GHASH block 8k+2
4764
rev64 v9.16b, v9.16b //GHASH block 8k+1
4765
4766
ldr q20, [x3, #128] //load h5l | h5h
4767
ext v20.16b, v20.16b, v20.16b, #8
4768
ldr q22, [x3, #160] //load h6l | h6h
4769
ext v22.16b, v22.16b, v22.16b, #8
4770
rev32 v7.16b, v30.16b //CTR block 8k+15
4771
4772
aese v0.16b, v26.16b
4773
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
4774
aese v6.16b, v26.16b
4775
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
4776
aese v5.16b, v26.16b
4777
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
4778
4779
aese v3.16b, v26.16b
4780
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
4781
aese v2.16b, v26.16b
4782
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
4783
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
4784
4785
aese v4.16b, v26.16b
4786
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
4787
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
4788
aese v1.16b, v26.16b
4789
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
4790
4791
aese v6.16b, v27.16b
4792
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
4793
aese v7.16b, v26.16b
4794
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
4795
ldp q28, q26, [x8, #32] //load rk2, rk3
4796
4797
aese v4.16b, v27.16b
4798
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
4799
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
4800
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
4801
4802
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
4803
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
4804
aese v3.16b, v27.16b
4805
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
4806
4807
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
4808
aese v7.16b, v27.16b
4809
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
4810
aese v0.16b, v27.16b
4811
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
4812
4813
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
4814
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
4815
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
4816
4817
aese v2.16b, v27.16b
4818
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
4819
aese v1.16b, v27.16b
4820
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
4821
aese v5.16b, v27.16b
4822
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
4823
4824
ldr q21, [x3, #144] //load h6k | h5k
4825
ldr q24, [x3, #192] //load h8k | h7k
4826
aese v3.16b, v28.16b
4827
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
4828
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
4829
4830
aese v6.16b, v28.16b
4831
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
4832
rev64 v13.16b, v13.16b //GHASH block 8k+5
4833
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
4834
4835
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
4836
aese v4.16b, v28.16b
4837
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
4838
aese v5.16b, v28.16b
4839
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
4840
4841
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
4842
aese v3.16b, v26.16b
4843
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
4844
aese v7.16b, v28.16b
4845
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
4846
4847
aese v0.16b, v28.16b
4848
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
4849
aese v2.16b, v28.16b
4850
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
4851
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
4852
4853
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
4854
aese v1.16b, v28.16b
4855
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
4856
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
4857
4858
aese v5.16b, v26.16b
4859
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
4860
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
4861
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
4862
4863
aese v7.16b, v26.16b
4864
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
4865
aese v6.16b, v26.16b
4866
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
4867
aese v4.16b, v26.16b
4868
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
4869
4870
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
4871
ldp q27, q28, [x8, #64] //load rk4, rk5
4872
aese v0.16b, v26.16b
4873
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
4874
4875
ldr q23, [x3, #80] //load h3l | h3h
4876
ext v23.16b, v23.16b, v23.16b, #8
4877
ldr q25, [x3, #112] //load h4l | h4h
4878
ext v25.16b, v25.16b, v25.16b, #8
4879
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
4880
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
4881
4882
ldr q20, [x3, #32] //load h1l | h1h
4883
ext v20.16b, v20.16b, v20.16b, #8
4884
ldr q22, [x3, #64] //load h2l | h2h
4885
ext v22.16b, v22.16b, v22.16b, #8
4886
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
4887
aese v2.16b, v26.16b
4888
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
4889
4890
rev64 v15.16b, v15.16b //GHASH block 8k+7
4891
4892
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
4893
rev64 v12.16b, v12.16b //GHASH block 8k+4
4894
4895
aese v5.16b, v27.16b
4896
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
4897
aese v4.16b, v27.16b
4898
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
4899
aese v1.16b, v26.16b
4900
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
4901
4902
aese v2.16b, v27.16b
4903
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
4904
aese v0.16b, v27.16b
4905
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
4906
aese v3.16b, v27.16b
4907
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
4908
4909
aese v1.16b, v27.16b
4910
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
4911
aese v6.16b, v27.16b
4912
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
4913
aese v7.16b, v27.16b
4914
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
4915
4916
rev64 v14.16b, v14.16b //GHASH block 8k+6
4917
ldr q21, [x3, #48] //load h2k | h1k
4918
ldr q24, [x3, #96] //load h4k | h3k
4919
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
4920
4921
aese v7.16b, v28.16b
4922
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
4923
aese v1.16b, v28.16b
4924
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
4925
aese v2.16b, v28.16b
4926
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
4927
4928
ldp q26, q27, [x8, #96] //load rk6, rk7
4929
aese v6.16b, v28.16b
4930
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
4931
aese v5.16b, v28.16b
4932
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
4933
4934
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
4935
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
4936
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
4937
4938
aese v4.16b, v28.16b
4939
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
4940
4941
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
4942
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
4943
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
4944
4945
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
4946
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
4947
aese v0.16b, v28.16b
4948
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
4949
4950
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
4951
aese v3.16b, v28.16b
4952
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
4953
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
4954
4955
aese v4.16b, v26.16b
4956
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
4957
aese v2.16b, v26.16b
4958
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
4959
4960
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
4961
aese v1.16b, v26.16b
4962
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
4963
aese v7.16b, v26.16b
4964
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
4965
4966
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
4967
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
4968
aese v0.16b, v26.16b
4969
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
4970
4971
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
4972
aese v5.16b, v26.16b
4973
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
4974
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
4975
4976
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
4977
aese v4.16b, v27.16b
4978
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
4979
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
4980
4981
aese v3.16b, v26.16b
4982
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
4983
aese v6.16b, v26.16b
4984
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
4985
aese v5.16b, v27.16b
4986
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
4987
4988
ldp q28, q26, [x8, #128] //load rk8, rk9
4989
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
4990
aese v2.16b, v27.16b
4991
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
4992
4993
ldr d16, [x10] //MODULO - load modulo constant
4994
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
4995
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
4996
4997
aese v1.16b, v27.16b
4998
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
4999
aese v7.16b, v27.16b
5000
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
5001
aese v6.16b, v27.16b
5002
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
5003
5004
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
5005
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
5006
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
5007
5008
aese v0.16b, v27.16b
5009
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
5010
aese v3.16b, v27.16b
5011
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
5012
5013
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
5014
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
5015
aese v2.16b, v28.16b
5016
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
5017
5018
aese v6.16b, v28.16b
5019
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
5020
aese v7.16b, v28.16b
5021
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
5022
aese v1.16b, v28.16b
5023
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
5024
5025
aese v3.16b, v28.16b
5026
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
5027
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
5028
aese v0.16b, v28.16b
5029
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
5030
5031
aese v5.16b, v28.16b
5032
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
5033
aese v4.16b, v28.16b
5034
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
5035
ldp q27, q28, [x8, #160] //load rk10, rk11
5036
5037
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
5038
aese v7.16b, v26.16b
5039
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
5040
aese v6.16b, v26.16b
5041
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
5042
5043
aese v5.16b, v26.16b
5044
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
5045
aese v2.16b, v26.16b
5046
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
5047
aese v3.16b, v26.16b
5048
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
5049
5050
aese v0.16b, v26.16b
5051
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
5052
aese v1.16b, v26.16b
5053
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
5054
aese v4.16b, v26.16b
5055
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
5056
5057
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
5058
ldr q26, [x8, #192] //load rk12
5059
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
5060
5061
aese v2.16b, v27.16b
5062
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
5063
aese v5.16b, v27.16b
5064
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
5065
aese v0.16b, v27.16b
5066
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
5067
5068
aese v4.16b, v27.16b
5069
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
5070
aese v6.16b, v27.16b
5071
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
5072
aese v7.16b, v27.16b
5073
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
5074
5075
aese v0.16b, v28.16b //AES block 8k+8 - round 11
5076
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
5077
aese v5.16b, v28.16b //AES block 8k+13 - round 11
5078
5079
aese v2.16b, v28.16b //AES block 8k+10 - round 11
5080
aese v3.16b, v27.16b
5081
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
5082
aese v1.16b, v27.16b
5083
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
5084
5085
aese v6.16b, v28.16b //AES block 8k+14 - round 11
5086
aese v4.16b, v28.16b //AES block 8k+12 - round 11
5087
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
5088
5089
aese v3.16b, v28.16b //AES block 8k+11 - round 11
5090
aese v1.16b, v28.16b //AES block 8k+9 - round 11
5091
aese v7.16b, v28.16b //AES block 8k+15 - round 11
5092
5093
.L192_dec_tail: //TAIL
// Tail dispatch. x5 is set to x4 - x0 (bytes left), the first remaining
// ciphertext block is loaded and combined with keystream v0 into v12, and
// control branches to the handler matching the number of blocks left.
// At every level that is skipped, the keystream registers are moved up by one
// (so the handler that is eventually entered finds its keystream in the
// register it reads) and v30 is stepped back by v31.
// The handlers for more than 7/6/5 blocks fall through into one another.
5094
5095
sub x5, x4, x0 //x5 (previously the main-loop end bound) now holds the number of bytes left to process; presumably x4 is the end-of-input pointer - TODO confirm against the function prologue
5096
5097
ldp q20, q21, [x3, #128] //load h5l | h5h (q20) and h6k | h5k (q21)
5098
ext v20.16b, v20.16b, v20.16b, #8
5099
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
5100
5101
ldp q24, q25, [x3, #192] //load h8k | h7k (q24) and h8l | h8h (q25)
5102
ext v25.16b, v25.16b, v25.16b, #8
5103
5104
mov v29.16b, v26.16b //keep the last round key in v29 (v26 = rk12 when arriving from prepretail); v26 is reused as a GHASH temporary by the handlers
5105
5106
ldp q22, q23, [x3, #160] //load h6l | h6h (q22) and h7l | h7h (q23)
5107
ext v22.16b, v22.16b, v22.16b, #8
5108
ext v23.16b, v23.16b, v23.16b, #8
5109
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
5110
5111
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
5112
cmp x5, #112
5113
b.gt .L192_dec_blocks_more_than_7
5114
5115
// At most 7 blocks left: the more_than_7 handler, which would have written
// v17/v18/v19 directly, is skipped, so those accumulators are zeroed here.
mov v7.16b, v6.16b
5116
movi v17.8b, #0
5117
sub v30.4s, v30.4s, v31.4s
5118
5119
mov v6.16b, v5.16b
5120
mov v5.16b, v4.16b
5121
mov v4.16b, v3.16b
5122
5123
cmp x5, #96
5124
movi v19.8b, #0
5125
mov v3.16b, v2.16b
5126
5127
mov v2.16b, v1.16b
5128
movi v18.8b, #0
5129
b.gt .L192_dec_blocks_more_than_6
5130
5131
// At most 6 blocks left
mov v7.16b, v6.16b
5132
mov v6.16b, v5.16b
5133
mov v5.16b, v4.16b
5134
5135
mov v4.16b, v3.16b
5136
mov v3.16b, v1.16b
5137
5138
sub v30.4s, v30.4s, v31.4s
5139
cmp x5, #80
5140
b.gt .L192_dec_blocks_more_than_5
5141
5142
// At most 5 blocks left
mov v7.16b, v6.16b
5143
mov v6.16b, v5.16b
5144
5145
mov v5.16b, v4.16b
5146
mov v4.16b, v1.16b
5147
cmp x5, #64
5148
5149
sub v30.4s, v30.4s, v31.4s
5150
b.gt .L192_dec_blocks_more_than_4
5151
5152
// At most 4 blocks left
sub v30.4s, v30.4s, v31.4s
5153
mov v7.16b, v6.16b
5154
mov v6.16b, v5.16b
5155
5156
mov v5.16b, v1.16b
5157
cmp x5, #48
5158
b.gt .L192_dec_blocks_more_than_3
5159
5160
// At most 3 blocks left
sub v30.4s, v30.4s, v31.4s
5161
mov v7.16b, v6.16b
5162
cmp x5, #32
5163
5164
mov v6.16b, v1.16b
5165
ldr q24, [x3, #96] //load h4k | h3k
5166
b.gt .L192_dec_blocks_more_than_2
5167
5168
// At most 2 blocks left
sub v30.4s, v30.4s, v31.4s
5169
5170
mov v7.16b, v1.16b
5171
cmp x5, #16
5172
b.gt .L192_dec_blocks_more_than_1
5173
5174
// At most 1 block left
sub v30.4s, v30.4s, v31.4s
5175
ldr q21, [x3, #48] //load h2k | h1k
5176
b .L192_dec_blocks_less_than_1
5177
.L192_dec_blocks_more_than_7: //blocks left > 7
// GHASH the ciphertext block in v9 (byte-reversed, with the running tag in v16
// xored in) against v25 for high/low and the upper half of v24 for mid. This
// handler writes v17/v18/v19 directly rather than accumulating into them.
// It then stores the pending result in v12, loads the next ciphertext block
// and combines it with keystream v1. Falls through to the "> 6" handler.
5178
rev64 v8.16b, v9.16b //GHASH final-7 block
5179
5180
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
5181
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5182
5183
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
5184
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
5185
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
5186
5187
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
5188
5189
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
5190
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
5191
5192
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
5193
5194
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
5195
movi v16.8b, #0 //suppress further partial tag feed in
5196
.L192_dec_blocks_more_than_6: //blocks left > 6
// GHASH the ciphertext block in v9 (byte-reversed, xored with v16, which is
// zero unless this is the first handler entered) against v23 for high/low and
// the low half of v24 for mid, accumulating into v17/v18/v19 by eor.
// It then stores the pending result in v12, loads the next ciphertext block
// and combines it with keystream v2. Falls through to the "> 5" handler.
5197
5198
rev64 v8.16b, v9.16b //GHASH final-6 block
5199
5200
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5201
5202
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
5203
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
5204
5205
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
5206
movi v16.8b, #0 //suppress further partial tag feed in
5207
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
5208
5209
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
5210
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
5211
5212
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
5213
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
5214
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
5215
5216
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
5217
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
5218
.L192_dec_blocks_more_than_5: //blocks left > 5
// Tail step for the final-5 block. Same pattern as the neighbouring steps:
// hash the ciphertext in v9, store the pending output v12, load the next
// ciphertext block and form the next output v12 = v9 ^ v3 ^ v29.
// Differences from the final-6 step:
//   - high/low products use v22;
//   - the mid product uses the UPPER half of v21 (pmull2), so the folded
//     middle operand in v27.d[0] is first copied to v27.d[1].
// v21 and v22 are loaded outside this section.
// Execution falls through into .L192_dec_blocks_more_than_4.
5219
5220
rev64 v8.16b, v9.16b //GHASH final-5 block
5221
5222
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5223
5224
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
5225
5226
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
5227
5228
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
5229
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
5230
5231
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
5232
5233
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
5234
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
5235
5236
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
5237
5238
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
5239
movi v16.8b, #0 //suppress further partial tag feed in
5240
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
5241
5242
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
5243
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
5244
.L192_dec_blocks_more_than_4: //blocks left > 4
// Tail step for the final-4 block. Hashes the ciphertext in v9 into the
// v17 (high) / v18 (mid) / v19 (low) accumulators, stores the pending
// output v12 to [x2], loads the next ciphertext block from [x0] and forms
// the next output v12 = v9 ^ v4 ^ v29.
// High/low products use v20; the mid product uses the LOWER half of v21
// (plain pmull), so no copy of v27.d[0] into v27.d[1] is needed here.
// v20 and v21 are loaded outside this section.
// Execution falls through into .L192_dec_blocks_more_than_3.
5245
5246
rev64 v8.16b, v9.16b //GHASH final-4 block
5247
5248
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5249
movi v16.8b, #0 //suppress further partial tag feed in
5250
5251
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
5252
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
5253
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
5254
5255
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
5256
5257
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
5258
5259
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
5260
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
5261
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
5262
5263
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
5264
5265
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
5266
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
5267
.L192_dec_blocks_more_than_3: //blocks left > 3
// Tail step for the final-3 block. From this step on the hash-key operands
// are reloaded from the table at x3 instead of being taken from registers
// set up earlier:
//   - v25 <- [x3, #112] with its two 64-bit halves swapped (ext #8),
//     used for the high and low products;
//   - v24 <- [x3, #96], whose UPPER half (pmull2) is used for the mid
//     product, hence the copy of v27.d[0] into v27.d[1].
// Otherwise the pattern matches the other tail steps: v8 = rev64(v9) ^ v16,
// products XORed into v17/v18/v19, pending output v12 stored to [x2],
// next ciphertext loaded from [x0], next output v12 = v9 ^ v5 ^ v29.
// Execution falls through into .L192_dec_blocks_more_than_2, which reuses
// the v24 loaded here.
5268
5269
ldr q25, [x3, #112] //load h4l | h4h
5270
ext v25.16b, v25.16b, v25.16b, #8
5271
rev64 v8.16b, v9.16b //GHASH final-3 block
5272
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
5273
5274
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5275
5276
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
5277
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
5278
5279
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
5280
movi v16.8b, #0 //suppress further partial tag feed in
5281
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
5282
5283
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
5284
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
5285
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
5286
5287
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
5288
ldr q24, [x3, #96] //load h4k | h3k
5289
5290
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
5291
5292
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
5293
5294
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
5295
.L192_dec_blocks_more_than_2: //blocks left > 2
// Tail step for the final-2 block.
//   - v23 <- [x3, #80] with its 64-bit halves swapped; used for the high
//     and low products.
//   - the mid product uses the LOWER half of v24 (the h3k half of the
//     h4k | h3k pair). v24 is not loaded here: it comes from the load in
//     the final-3 step on the fall-through path, or from the load issued
//     just before the direct branch to this label.
//   - products are XORed into v17 (high), v18 (mid), v19 (low).
//   - pending output v12 is stored to [x2], the next ciphertext block is
//     loaded from [x0], and the next output is v12 = v9 ^ v6 ^ v29.
// Execution falls through into .L192_dec_blocks_more_than_1.
5296
5297
rev64 v8.16b, v9.16b //GHASH final-2 block
5298
ldr q23, [x3, #80] //load h3l | h3h
5299
ext v23.16b, v23.16b, v23.16b, #8
5300
5301
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5302
5303
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
5304
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
5305
5306
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
5307
5308
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
5309
5310
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
5311
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
5312
5313
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
5314
movi v16.8b, #0 //suppress further partial tag feed in
5315
5316
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
5317
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
5318
5319
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
5320
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
5321
.L192_dec_blocks_more_than_1: //blocks left > 1
// Tail step for the final-1 (second to last) block.
//   - v22 <- [x3, #64] with its 64-bit halves swapped; used for the high
//     and low products. This is the h2 entry of the key table: the same
//     offset is labelled h2l | h2h elsewhere in this file, and the final
//     block below uses [x3, #32] as h1.
//   - v21 <- [x3, #48] (h2k | h1k). The mid product here uses its UPPER
//     half via pmull2, so v27.d[0] is copied to v27.d[1] first. The final
//     block step reuses v21 and takes the lower half.
//   - products are XORed into v17 (high), v18 (mid), v19 (low).
//   - pending output v12 is stored to [x2]; the last ciphertext block is
//     loaded from [x0]; v12 = v9 ^ v7 ^ v29 becomes the last output block,
//     which is masked and stored by the following section.
// Execution falls through into .L192_dec_blocks_less_than_1.
5322
5323
rev64 v8.16b, v9.16b //GHASH final-1 block
5324
ldr q9, [x0], #16 //AES final block - load ciphertext
5325
ldr q22, [x3, #64] //load h2l | h2h
5326
ext v22.16b, v22.16b, v22.16b, #8
5327
5328
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5329
movi v16.8b, #0 //suppress further partial tag feed in
5330
ldr q21, [x3, #48] //load h2k | h1k
5331
5332
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
5333
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
5334
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
5335
5336
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
5337
5338
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
5339
5340
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
5341
5342
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
5343
5344
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
5345
5346
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
5347
5348
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
5349
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
5350
.L192_dec_blocks_less_than_1: //blocks left <= 1
// Final (possibly partial) block, GHASH reduction and function epilogue.
//
// 1. Counter write-back: v30 is byte-swapped per 32-bit word and stored
//    to [x16].
// 2. Partial-block mask. x1 (called bit_length in the comments below) is
//    turned into the number of bits to drop from the top of the block:
//      x1 = (128 - (x1 mod 128)) mod 128        -> range [0,127]
//    The mask is all-ones shifted right by x1. A register-specified LSR
//    uses the shift count modulo 64, so:
//      x1 <  64: x13 (low 64b) = all ones,          x14 (high 64b) = ones >> x1
//      x1 >= 64: x13 (low 64b) = ones >> (x1 - 64), x14 (high 64b) = 0
//    v0 = { d[0] = x13, d[1] = x14 }.
// 3. The ciphertext in v9 is ANDed with the mask before hashing. For the
//    output, BIF copies the bits of v26 (current contents of [x2]) into
//    v12 wherever the mask is 0, so the 16-byte store does not change
//    bytes past the end of the message.
// 4. The last block is hashed with v20 (from [x3, #32], halves swapped)
//    and the LOWER half of v21. v16 is reused as the middle operand once
//    its role as the partial tag is over.
// 5. Reduction: the high/mid/low accumulators v17/v18/v19 are folded into
//    a single 128-bit value using the 64-bit constant at [x10], and the
//    result is stored to [x3] after ext #8 + rev64 (the inverse of how
//    the tag is loaded at the start of the kernels in this file).
//    NOTE(review): [x10] presumably holds 0xc200000000000000 placed on
//    the stack by this function prologue, as the enc_256 prologue below
//    does - confirm, the dec_192 prologue is not visible here.
// 6. Return x9 in x0 and restore d8-d15 from the 80-byte stack frame.
//    NOTE(review): x9 is presumably the byte length (x1 >> 3) as in the
//    enc_256 prologue - confirm.
5351
5352
rev32 v30.16b, v30.16b
5353
and x1, x1, #127 //bit_length %= 128
5354
5355
sub x1, x1, #128 //bit_length -= 128
5356
str q30, [x16] //store the updated counter
5357
5358
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
5359
mvn x6, xzr //temp0_x = 0xffffffffffffffff
5360
5361
and x1, x1, #127 //bit_length %= 128
5362
5363
mvn x7, xzr //temp1_x = 0xffffffffffffffff
5364
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
5365
cmp x1, #64
5366
5367
csel x13, x7, x6, lt //low 64b mask: all ones if fewer than 64 bits are dropped
5368
csel x14, x6, xzr, lt //high 64b mask: partial if fewer than 64 bits are dropped, else 0
5369
ldr q20, [x3, #32] //load h1l | h1h
5370
ext v20.16b, v20.16b, v20.16b, #8
5371
5372
mov v0.d[1], x14
5373
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
5374
5375
mov v0.d[0], x13 //ctr0b is mask for last block
5376
5377
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
5378
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
5379
5380
rev64 v8.16b, v9.16b //GHASH final block
5381
5382
st1 { v12.16b}, [x2] //store all 16B
5383
5384
eor v8.16b, v8.16b, v16.16b //feed in partial tag
5385
5386
ins v16.d[0], v8.d[1] //GHASH final block - mid
5387
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
5388
5389
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
5390
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
5391
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
5392
5393
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
5394
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
5395
5396
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
5397
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
5398
// v16 is free again after the XOR above and is reloaded with the constant.
ldr d16, [x10] //MODULO - load modulo constant
5399
5400
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
5401
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
5402
5403
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
5404
5405
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
5406
5407
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
5408
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
5409
5410
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
5411
ext v19.16b, v19.16b, v19.16b, #8
5412
rev64 v19.16b, v19.16b
5413
st1 { v19.16b }, [x3] //store the updated tag
5414
5415
mov x0, x9 //return value
5416
5417
// Epilogue: restore the low halves of v8-v15 and release the 80-byte frame.
ldp d10, d11, [sp, #16]
5418
ldp d12, d13, [sp, #32]
5419
ldp d14, d15, [sp, #48]
5420
ldp d8, d9, [sp], #80
5421
ret
5422
5423
.L192_dec_ret:
// Early-exit path: returns 0 in w0 without touching the stack or any
// vector register.
// NOTE(review): the branch to this label is outside the visible part of
// the function; presumably it is the zero-length check at entry, like
// the cbz x1 at the top of the enc_256 kernel below - confirm.
5424
mov w0, #0x0
5425
ret
5426
.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
5427
.globl unroll8_eor3_aes_gcm_enc_256_kernel
5428
.type unroll8_eor3_aes_gcm_enc_256_kernel,%function
5429
.align 4
5430
unroll8_eor3_aes_gcm_enc_256_kernel:
5431
AARCH64_VALID_CALL_TARGET
5432
cbz x1, .L256_enc_ret
5433
stp d8, d9, [sp, #-80]!
5434
lsr x9, x1, #3
5435
mov x16, x4
5436
mov x8, x5
5437
stp d10, d11, [sp, #16]
5438
stp d12, d13, [sp, #32]
5439
stp d14, d15, [sp, #48]
5440
mov x5, #0xc200000000000000
5441
stp x5, xzr, [sp, #64]
5442
add x10, sp, #64
5443
5444
ld1 { v0.16b}, [x16] //CTR block 0
5445
5446
mov x5, x9
5447
5448
mov x15, #0x100000000 //set up counter increment
5449
movi v31.16b, #0x0
5450
mov v31.d[1], x15
5451
sub x5, x5, #1 //byte_len - 1
5452
5453
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5454
5455
add x5, x5, x0
5456
5457
rev32 v30.16b, v0.16b //set up reversed counter
5458
5459
add v30.4s, v30.4s, v31.4s //CTR block 0
5460
5461
rev32 v1.16b, v30.16b //CTR block 1
5462
add v30.4s, v30.4s, v31.4s //CTR block 1
5463
5464
rev32 v2.16b, v30.16b //CTR block 2
5465
add v30.4s, v30.4s, v31.4s //CTR block 2
5466
5467
rev32 v3.16b, v30.16b //CTR block 3
5468
add v30.4s, v30.4s, v31.4s //CTR block 3
5469
5470
rev32 v4.16b, v30.16b //CTR block 4
5471
add v30.4s, v30.4s, v31.4s //CTR block 4
5472
5473
rev32 v5.16b, v30.16b //CTR block 5
5474
add v30.4s, v30.4s, v31.4s //CTR block 5
5475
ldp q26, q27, [x8, #0] //load rk0, rk1
5476
5477
rev32 v6.16b, v30.16b //CTR block 6
5478
add v30.4s, v30.4s, v31.4s //CTR block 6
5479
5480
rev32 v7.16b, v30.16b //CTR block 7
5481
5482
aese v3.16b, v26.16b
5483
aesmc v3.16b, v3.16b //AES block 3 - round 0
5484
aese v4.16b, v26.16b
5485
aesmc v4.16b, v4.16b //AES block 4 - round 0
5486
aese v2.16b, v26.16b
5487
aesmc v2.16b, v2.16b //AES block 2 - round 0
5488
5489
aese v0.16b, v26.16b
5490
aesmc v0.16b, v0.16b //AES block 0 - round 0
5491
aese v1.16b, v26.16b
5492
aesmc v1.16b, v1.16b //AES block 1 - round 0
5493
aese v6.16b, v26.16b
5494
aesmc v6.16b, v6.16b //AES block 6 - round 0
5495
5496
aese v5.16b, v26.16b
5497
aesmc v5.16b, v5.16b //AES block 5 - round 0
5498
aese v7.16b, v26.16b
5499
aesmc v7.16b, v7.16b //AES block 7 - round 0
5500
ldp q28, q26, [x8, #32] //load rk2, rk3
5501
5502
aese v4.16b, v27.16b
5503
aesmc v4.16b, v4.16b //AES block 4 - round 1
5504
aese v1.16b, v27.16b
5505
aesmc v1.16b, v1.16b //AES block 1 - round 1
5506
aese v3.16b, v27.16b
5507
aesmc v3.16b, v3.16b //AES block 3 - round 1
5508
5509
aese v6.16b, v27.16b
5510
aesmc v6.16b, v6.16b //AES block 6 - round 1
5511
aese v5.16b, v27.16b
5512
aesmc v5.16b, v5.16b //AES block 5 - round 1
5513
5514
aese v2.16b, v27.16b
5515
aesmc v2.16b, v2.16b //AES block 2 - round 1
5516
5517
aese v7.16b, v27.16b
5518
aesmc v7.16b, v7.16b //AES block 7 - round 1
5519
5520
aese v2.16b, v28.16b
5521
aesmc v2.16b, v2.16b //AES block 2 - round 2
5522
aese v3.16b, v28.16b
5523
aesmc v3.16b, v3.16b //AES block 3 - round 2
5524
aese v0.16b, v27.16b
5525
aesmc v0.16b, v0.16b //AES block 0 - round 1
5526
5527
aese v7.16b, v28.16b
5528
aesmc v7.16b, v7.16b //AES block 7 - round 2
5529
aese v6.16b, v28.16b
5530
aesmc v6.16b, v6.16b //AES block 6 - round 2
5531
aese v5.16b, v28.16b
5532
aesmc v5.16b, v5.16b //AES block 5 - round 2
5533
5534
aese v4.16b, v28.16b
5535
aesmc v4.16b, v4.16b //AES block 4 - round 2
5536
aese v0.16b, v28.16b
5537
aesmc v0.16b, v0.16b //AES block 0 - round 2
5538
aese v1.16b, v28.16b
5539
aesmc v1.16b, v1.16b //AES block 1 - round 2
5540
5541
aese v5.16b, v26.16b
5542
aesmc v5.16b, v5.16b //AES block 5 - round 3
5543
aese v3.16b, v26.16b
5544
aesmc v3.16b, v3.16b //AES block 3 - round 3
5545
ldp q27, q28, [x8, #64] //load rk4, rk5
5546
5547
aese v4.16b, v26.16b
5548
aesmc v4.16b, v4.16b //AES block 4 - round 3
5549
5550
aese v1.16b, v26.16b
5551
aesmc v1.16b, v1.16b //AES block 1 - round 3
5552
aese v6.16b, v26.16b
5553
aesmc v6.16b, v6.16b //AES block 6 - round 3
5554
aese v7.16b, v26.16b
5555
aesmc v7.16b, v7.16b //AES block 7 - round 3
5556
5557
aese v2.16b, v26.16b
5558
aesmc v2.16b, v2.16b //AES block 2 - round 3
5559
aese v0.16b, v26.16b
5560
aesmc v0.16b, v0.16b //AES block 0 - round 3
5561
5562
aese v4.16b, v27.16b
5563
aesmc v4.16b, v4.16b //AES block 4 - round 4
5564
aese v6.16b, v27.16b
5565
aesmc v6.16b, v6.16b //AES block 6 - round 4
5566
aese v1.16b, v27.16b
5567
aesmc v1.16b, v1.16b //AES block 1 - round 4
5568
5569
aese v2.16b, v27.16b
5570
aesmc v2.16b, v2.16b //AES block 2 - round 4
5571
aese v0.16b, v27.16b
5572
aesmc v0.16b, v0.16b //AES block 0 - round 4
5573
5574
aese v3.16b, v27.16b
5575
aesmc v3.16b, v3.16b //AES block 3 - round 4
5576
aese v7.16b, v27.16b
5577
aesmc v7.16b, v7.16b //AES block 7 - round 4
5578
aese v5.16b, v27.16b
5579
aesmc v5.16b, v5.16b //AES block 5 - round 4
5580
5581
aese v0.16b, v28.16b
5582
aesmc v0.16b, v0.16b //AES block 0 - round 5
5583
aese v2.16b, v28.16b
5584
aesmc v2.16b, v2.16b //AES block 2 - round 5
5585
ldp q26, q27, [x8, #96] //load rk6, rk7
5586
5587
aese v1.16b, v28.16b
5588
aesmc v1.16b, v1.16b //AES block 1 - round 5
5589
aese v4.16b, v28.16b
5590
aesmc v4.16b, v4.16b //AES block 4 - round 5
5591
aese v5.16b, v28.16b
5592
aesmc v5.16b, v5.16b //AES block 5 - round 5
5593
5594
aese v3.16b, v28.16b
5595
aesmc v3.16b, v3.16b //AES block 3 - round 5
5596
aese v6.16b, v28.16b
5597
aesmc v6.16b, v6.16b //AES block 6 - round 5
5598
aese v7.16b, v28.16b
5599
aesmc v7.16b, v7.16b //AES block 7 - round 5
5600
5601
aese v1.16b, v26.16b
5602
aesmc v1.16b, v1.16b //AES block 1 - round 6
5603
aese v5.16b, v26.16b
5604
aesmc v5.16b, v5.16b //AES block 5 - round 6
5605
aese v4.16b, v26.16b
5606
aesmc v4.16b, v4.16b //AES block 4 - round 6
5607
5608
aese v2.16b, v26.16b
5609
aesmc v2.16b, v2.16b //AES block 2 - round 6
5610
aese v6.16b, v26.16b
5611
aesmc v6.16b, v6.16b //AES block 6 - round 6
5612
aese v0.16b, v26.16b
5613
aesmc v0.16b, v0.16b //AES block 0 - round 6
5614
5615
aese v7.16b, v26.16b
5616
aesmc v7.16b, v7.16b //AES block 7 - round 6
5617
aese v3.16b, v26.16b
5618
aesmc v3.16b, v3.16b //AES block 3 - round 6
5619
ldp q28, q26, [x8, #128] //load rk8, rk9
5620
5621
aese v2.16b, v27.16b
5622
aesmc v2.16b, v2.16b //AES block 2 - round 7
5623
aese v0.16b, v27.16b
5624
aesmc v0.16b, v0.16b //AES block 0 - round 7
5625
5626
aese v7.16b, v27.16b
5627
aesmc v7.16b, v7.16b //AES block 7 - round 7
5628
aese v6.16b, v27.16b
5629
aesmc v6.16b, v6.16b //AES block 6 - round 7
5630
aese v1.16b, v27.16b
5631
aesmc v1.16b, v1.16b //AES block 1 - round 7
5632
5633
aese v5.16b, v27.16b
5634
aesmc v5.16b, v5.16b //AES block 5 - round 7
5635
aese v3.16b, v27.16b
5636
aesmc v3.16b, v3.16b //AES block 3 - round 7
5637
5638
aese v4.16b, v27.16b
5639
aesmc v4.16b, v4.16b //AES block 4 - round 7
5640
5641
aese v6.16b, v28.16b
5642
aesmc v6.16b, v6.16b //AES block 6 - round 8
5643
aese v1.16b, v28.16b
5644
aesmc v1.16b, v1.16b //AES block 1 - round 8
5645
5646
aese v3.16b, v28.16b
5647
aesmc v3.16b, v3.16b //AES block 3 - round 8
5648
aese v0.16b, v28.16b
5649
aesmc v0.16b, v0.16b //AES block 0 - round 8
5650
aese v7.16b, v28.16b
5651
aesmc v7.16b, v7.16b //AES block 7 - round 8
5652
5653
aese v5.16b, v28.16b
5654
aesmc v5.16b, v5.16b //AES block 5 - round 8
5655
aese v4.16b, v28.16b
5656
aesmc v4.16b, v4.16b //AES block 4 - round 8
5657
aese v2.16b, v28.16b
5658
aesmc v2.16b, v2.16b //AES block 2 - round 8
5659
5660
ld1 { v19.16b}, [x3]
5661
ext v19.16b, v19.16b, v19.16b, #8
5662
rev64 v19.16b, v19.16b
5663
ldp q27, q28, [x8, #160] //load rk10, rk11
5664
5665
aese v6.16b, v26.16b
5666
aesmc v6.16b, v6.16b //AES block 6 - round 9
5667
aese v7.16b, v26.16b
5668
aesmc v7.16b, v7.16b //AES block 7 - round 9
5669
aese v3.16b, v26.16b
5670
aesmc v3.16b, v3.16b //AES block 3 - round 9
5671
5672
aese v4.16b, v26.16b
5673
aesmc v4.16b, v4.16b //AES block 4 - round 9
5674
aese v5.16b, v26.16b
5675
aesmc v5.16b, v5.16b //AES block 5 - round 9
5676
aese v2.16b, v26.16b
5677
aesmc v2.16b, v2.16b //AES block 2 - round 9
5678
5679
aese v1.16b, v26.16b
5680
aesmc v1.16b, v1.16b //AES block 1 - round 9
5681
5682
aese v7.16b, v27.16b
5683
aesmc v7.16b, v7.16b //AES block 7 - round 10
5684
aese v4.16b, v27.16b
5685
aesmc v4.16b, v4.16b //AES block 4 - round 10
5686
aese v0.16b, v26.16b
5687
aesmc v0.16b, v0.16b //AES block 0 - round 9
5688
5689
aese v1.16b, v27.16b
5690
aesmc v1.16b, v1.16b //AES block 1 - round 10
5691
aese v5.16b, v27.16b
5692
aesmc v5.16b, v5.16b //AES block 5 - round 10
5693
aese v3.16b, v27.16b
5694
aesmc v3.16b, v3.16b //AES block 3 - round 10
5695
5696
aese v2.16b, v27.16b
5697
aesmc v2.16b, v2.16b //AES block 2 - round 10
5698
aese v0.16b, v27.16b
5699
aesmc v0.16b, v0.16b //AES block 0 - round 10
5700
aese v6.16b, v27.16b
5701
aesmc v6.16b, v6.16b //AES block 6 - round 10
5702
5703
aese v4.16b, v28.16b
5704
aesmc v4.16b, v4.16b //AES block 4 - round 11
5705
ldp q26, q27, [x8, #192] //load rk12, rk13
5706
aese v5.16b, v28.16b
5707
aesmc v5.16b, v5.16b //AES block 5 - round 11
5708
5709
aese v2.16b, v28.16b
5710
aesmc v2.16b, v2.16b //AES block 2 - round 11
5711
aese v6.16b, v28.16b
5712
aesmc v6.16b, v6.16b //AES block 6 - round 11
5713
aese v1.16b, v28.16b
5714
aesmc v1.16b, v1.16b //AES block 1 - round 11
5715
5716
aese v0.16b, v28.16b
5717
aesmc v0.16b, v0.16b //AES block 0 - round 11
5718
aese v3.16b, v28.16b
5719
aesmc v3.16b, v3.16b //AES block 3 - round 11
5720
aese v7.16b, v28.16b
5721
aesmc v7.16b, v7.16b //AES block 7 - round 11
5722
5723
add v30.4s, v30.4s, v31.4s //CTR block 7
5724
ldr q28, [x8, #224] //load rk14
5725
5726
aese v4.16b, v26.16b
5727
aesmc v4.16b, v4.16b //AES block 4 - round 12
5728
aese v2.16b, v26.16b
5729
aesmc v2.16b, v2.16b //AES block 2 - round 12
5730
aese v1.16b, v26.16b
5731
aesmc v1.16b, v1.16b //AES block 1 - round 12
5732
5733
aese v0.16b, v26.16b
5734
aesmc v0.16b, v0.16b //AES block 0 - round 12
5735
aese v5.16b, v26.16b
5736
aesmc v5.16b, v5.16b //AES block 5 - round 12
5737
aese v3.16b, v26.16b
5738
aesmc v3.16b, v3.16b //AES block 3 - round 12
5739
5740
aese v2.16b, v27.16b //AES block 2 - round 13
5741
aese v1.16b, v27.16b //AES block 1 - round 13
5742
aese v4.16b, v27.16b //AES block 4 - round 13
5743
5744
aese v6.16b, v26.16b
5745
aesmc v6.16b, v6.16b //AES block 6 - round 12
5746
aese v7.16b, v26.16b
5747
aesmc v7.16b, v7.16b //AES block 7 - round 12
5748
5749
aese v0.16b, v27.16b //AES block 0 - round 13
5750
aese v5.16b, v27.16b //AES block 5 - round 13
5751
5752
aese v6.16b, v27.16b //AES block 6 - round 13
5753
aese v7.16b, v27.16b //AES block 7 - round 13
5754
aese v3.16b, v27.16b //AES block 3 - round 13
5755
5756
add x4, x0, x1, lsr #3 //end_input_ptr
5757
cmp x0, x5 //check if we have <= 8 blocks
5758
b.ge .L256_enc_tail //handle tail
5759
5760
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
5761
5762
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
5763
5764
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
5765
rev32 v0.16b, v30.16b //CTR block 8
5766
add v30.4s, v30.4s, v31.4s //CTR block 8
5767
5768
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
5769
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
5770
5771
rev32 v1.16b, v30.16b //CTR block 9
5772
add v30.4s, v30.4s, v31.4s //CTR block 9
5773
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
5774
5775
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
5776
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
5777
cmp x0, x5 //check if we have <= 8 blocks
5778
5779
rev32 v2.16b, v30.16b //CTR block 10
5780
add v30.4s, v30.4s, v31.4s //CTR block 10
5781
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
5782
5783
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
5784
5785
rev32 v3.16b, v30.16b //CTR block 11
5786
add v30.4s, v30.4s, v31.4s //CTR block 11
5787
5788
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
5789
5790
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
5791
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
5792
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
5793
5794
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
5795
rev32 v4.16b, v30.16b //CTR block 12
5796
5797
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
5798
add v30.4s, v30.4s, v31.4s //CTR block 12
5799
b.ge .L256_enc_prepretail //do prepretail
5800
5801
.L256_enc_main_loop: //main loop start
5802
ldp q26, q27, [x8, #0] //load rk0, rk1
5803
5804
rev32 v5.16b, v30.16b //CTR block 8k+13
5805
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
5806
ldr q21, [x3, #144] //load h6k | h5k
5807
ldr q24, [x3, #192] //load h8k | h7k
5808
5809
rev64 v11.16b, v11.16b //GHASH block 8k+3
5810
ldr q20, [x3, #128] //load h5l | h5h
5811
ext v20.16b, v20.16b, v20.16b, #8
5812
ldr q22, [x3, #160] //load h6l | h6h
5813
ext v22.16b, v22.16b, v22.16b, #8
5814
rev64 v9.16b, v9.16b //GHASH block 8k+1
5815
5816
rev32 v6.16b, v30.16b //CTR block 8k+14
5817
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
5818
rev64 v8.16b, v8.16b //GHASH block 8k
5819
5820
rev64 v12.16b, v12.16b //GHASH block 8k+4
5821
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
5822
ldr q23, [x3, #176] //load h7l | h7h
5823
ext v23.16b, v23.16b, v23.16b, #8
5824
ldr q25, [x3, #208] //load h8l | h8h
5825
ext v25.16b, v25.16b, v25.16b, #8
5826
5827
aese v3.16b, v26.16b
5828
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
5829
aese v5.16b, v26.16b
5830
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
5831
rev32 v7.16b, v30.16b //CTR block 8k+15
5832
5833
aese v0.16b, v26.16b
5834
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
5835
aese v1.16b, v26.16b
5836
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
5837
aese v6.16b, v26.16b
5838
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
5839
5840
aese v7.16b, v26.16b
5841
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
5842
aese v2.16b, v26.16b
5843
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
5844
aese v4.16b, v26.16b
5845
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
5846
5847
ldp q28, q26, [x8, #32] //load rk2, rk3
5848
eor v8.16b, v8.16b, v19.16b //PRE 1
5849
aese v6.16b, v27.16b
5850
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
5851
5852
aese v2.16b, v27.16b
5853
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
5854
aese v1.16b, v27.16b
5855
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
5856
aese v0.16b, v27.16b
5857
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
5858
5859
aese v4.16b, v27.16b
5860
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
5861
aese v3.16b, v27.16b
5862
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
5863
aese v5.16b, v27.16b
5864
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
5865
5866
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
5867
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
5868
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
5869
5870
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
5871
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
5872
aese v7.16b, v27.16b
5873
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
5874
5875
aese v1.16b, v28.16b
5876
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
5877
aese v5.16b, v28.16b
5878
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
5879
aese v6.16b, v28.16b
5880
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
5881
5882
aese v2.16b, v28.16b
5883
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
5884
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
5885
aese v4.16b, v28.16b
5886
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
5887
5888
aese v5.16b, v26.16b
5889
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
5890
aese v6.16b, v26.16b
5891
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
5892
aese v0.16b, v28.16b
5893
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
5894
5895
aese v1.16b, v26.16b
5896
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
5897
aese v7.16b, v28.16b
5898
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
5899
aese v3.16b, v28.16b
5900
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
5901
5902
aese v4.16b, v26.16b
5903
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
5904
rev64 v14.16b, v14.16b //GHASH block 8k+6
5905
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
5906
5907
aese v3.16b, v26.16b
5908
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
5909
ldp q27, q28, [x8, #64] //load rk4, rk5
5910
rev64 v10.16b, v10.16b //GHASH block 8k+2
5911
5912
aese v2.16b, v26.16b
5913
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
5914
aese v7.16b, v26.16b
5915
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
5916
aese v0.16b, v26.16b
5917
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
5918
5919
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
5920
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
5921
rev64 v13.16b, v13.16b //GHASH block 8k+5
5922
5923
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
5924
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
5925
ldr q23, [x3, #80] //load h3l | h3h
5926
ext v23.16b, v23.16b, v23.16b, #8
5927
ldr q25, [x3, #112] //load h4l | h4h
5928
ext v25.16b, v25.16b, v25.16b, #8
5929
5930
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
5931
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
5932
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
5933
5934
aese v4.16b, v27.16b
5935
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
5936
aese v1.16b, v27.16b
5937
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
5938
aese v5.16b, v27.16b
5939
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
5940
5941
aese v7.16b, v27.16b
5942
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
5943
aese v3.16b, v27.16b
5944
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
5945
aese v2.16b, v27.16b
5946
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
5947
5948
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
5949
aese v6.16b, v27.16b
5950
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
5951
aese v0.16b, v27.16b
5952
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
5953
5954
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
5955
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
5956
ldp q26, q27, [x8, #96] //load rk6, rk7
5957
5958
aese v5.16b, v28.16b
5959
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
5960
aese v7.16b, v28.16b
5961
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
5962
aese v4.16b, v28.16b
5963
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
5964
5965
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
5966
aese v2.16b, v28.16b
5967
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
5968
rev64 v15.16b, v15.16b //GHASH block 8k+7
5969
5970
aese v3.16b, v28.16b
5971
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
5972
aese v6.16b, v28.16b
5973
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
5974
aese v1.16b, v28.16b
5975
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
5976
5977
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
5978
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
5979
aese v0.16b, v28.16b
5980
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
5981
5982
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
5983
aese v4.16b, v26.16b
5984
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
5985
aese v2.16b, v26.16b
5986
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
5987
5988
aese v6.16b, v26.16b
5989
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
5990
aese v1.16b, v26.16b
5991
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
5992
aese v7.16b, v26.16b
5993
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
5994
5995
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
5996
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
5997
aese v5.16b, v26.16b
5998
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
5999
6000
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
6001
aese v3.16b, v26.16b
6002
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
6003
aese v0.16b, v26.16b
6004
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
6005
6006
ldp q28, q26, [x8, #128] //load rk8, rk9
6007
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
6008
aese v5.16b, v27.16b
6009
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
6010
6011
ldr q20, [x3, #32] //load h1l | h1h
6012
ext v20.16b, v20.16b, v20.16b, #8
6013
ldr q22, [x3, #64] //load h2l | h2h
6014
ext v22.16b, v22.16b, v22.16b, #8
6015
aese v2.16b, v27.16b
6016
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
6017
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
6018
6019
ldr q21, [x3, #48] //load h2k | h1k
6020
ldr q24, [x3, #96] //load h4k | h3k
6021
aese v6.16b, v27.16b
6022
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
6023
aese v3.16b, v27.16b
6024
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
6025
6026
aese v0.16b, v27.16b
6027
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
6028
aese v7.16b, v27.16b
6029
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
6030
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
6031
6032
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
6033
aese v4.16b, v27.16b
6034
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
6035
aese v1.16b, v27.16b
6036
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
6037
6038
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
6039
aese v7.16b, v28.16b
6040
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
6041
aese v0.16b, v28.16b
6042
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
6043
6044
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
6045
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
6046
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
6047
6048
aese v3.16b, v28.16b
6049
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
6050
aese v0.16b, v26.16b
6051
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
6052
aese v1.16b, v28.16b
6053
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
6054
6055
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
6056
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
6057
aese v2.16b, v28.16b
6058
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
6059
6060
aese v5.16b, v28.16b
6061
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
6062
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
6063
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
6064
6065
aese v6.16b, v28.16b
6066
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
6067
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
6068
aese v4.16b, v28.16b
6069
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
6070
6071
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
6072
aese v7.16b, v26.16b
6073
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
6074
aese v5.16b, v26.16b
6075
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
6076
6077
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
6078
aese v6.16b, v26.16b
6079
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
6080
aese v4.16b, v26.16b
6081
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
6082
6083
ldp q27, q28, [x8, #160] //load rk10, rk11
6084
aese v2.16b, v26.16b
6085
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
6086
aese v3.16b, v26.16b
6087
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
6088
6089
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
6090
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
6091
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
6092
6093
ldr d16, [x10] //MODULO - load modulo constant
6094
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
6095
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
6096
6097
aese v1.16b, v26.16b
6098
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
6099
6100
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
6101
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
6102
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
6103
6104
aese v4.16b, v27.16b
6105
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
6106
aese v3.16b, v27.16b
6107
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
6108
aese v5.16b, v27.16b
6109
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
6110
6111
aese v0.16b, v27.16b
6112
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
6113
aese v2.16b, v27.16b
6114
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
6115
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
6116
6117
aese v1.16b, v27.16b
6118
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
6119
aese v7.16b, v27.16b
6120
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
6121
aese v6.16b, v27.16b
6122
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
6123
6124
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
6125
6126
ldp q26, q27, [x8, #192] //load rk12, rk13
6127
rev32 v20.16b, v30.16b //CTR block 8k+16
6128
6129
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
6130
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
6131
aese v2.16b, v28.16b
6132
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
6133
6134
aese v6.16b, v28.16b
6135
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
6136
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
6137
aese v3.16b, v28.16b
6138
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
6139
6140
aese v0.16b, v28.16b
6141
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
6142
aese v7.16b, v28.16b
6143
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
6144
6145
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
6146
aese v1.16b, v28.16b
6147
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
6148
6149
aese v7.16b, v26.16b
6150
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
6151
aese v5.16b, v28.16b
6152
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
6153
6154
aese v3.16b, v26.16b
6155
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
6156
aese v6.16b, v26.16b
6157
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
6158
rev32 v22.16b, v30.16b //CTR block 8k+17
6159
6160
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
6161
aese v4.16b, v28.16b
6162
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
6163
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
6164
6165
aese v5.16b, v26.16b
6166
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
6167
ldr q28, [x8, #224] //load rk14
6168
aese v7.16b, v27.16b //AES block 8k+15 - round 13
6169
6170
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
6171
aese v2.16b, v26.16b
6172
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
6173
aese v4.16b, v26.16b
6174
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
6175
6176
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
6177
aese v1.16b, v26.16b
6178
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
6179
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
6180
6181
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
6182
aese v2.16b, v27.16b //AES block 8k+10 - round 13
6183
aese v4.16b, v27.16b //AES block 8k+12 - round 13
6184
6185
rev32 v23.16b, v30.16b //CTR block 8k+18
6186
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
6187
aese v5.16b, v27.16b //AES block 8k+13 - round 13
6188
6189
aese v0.16b, v26.16b
6190
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
6191
aese v3.16b, v27.16b //AES block 8k+11 - round 13
6192
cmp x0, x5 //.LOOP CONTROL
6193
6194
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
6195
rev32 v25.16b, v30.16b //CTR block 8k+19
6196
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
6197
6198
aese v0.16b, v27.16b //AES block 8k+8 - round 13
6199
aese v6.16b, v27.16b //AES block 8k+14 - round 13
6200
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
6201
6202
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
6203
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
6204
aese v1.16b, v27.16b //AES block 8k+9 - round 13
6205
6206
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
6207
rev32 v4.16b, v30.16b //CTR block 8k+20
6208
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
6209
6210
mov v3.16b, v25.16b //CTR block 8k+19
6211
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
6212
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
6213
6214
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
6215
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
6216
mov v2.16b, v23.16b //CTR block 8k+18
6217
6218
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
6219
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
6220
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
6221
6222
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
6223
mov v1.16b, v22.16b //CTR block 8k+17
6224
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
6225
6226
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
6227
mov v0.16b, v20.16b //CTR block 8k+16
6228
b.lt .L256_enc_main_loop
6229
6230
.L256_enc_prepretail: //PREPRETAIL
6231
rev32 v5.16b, v30.16b //CTR block 8k+13
6232
ldp q26, q27, [x8, #0] //load rk0, rk1
6233
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
6234
6235
rev64 v10.16b, v10.16b //GHASH block 8k+2
6236
6237
rev32 v6.16b, v30.16b //CTR block 8k+14
6238
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
6239
6240
rev64 v13.16b, v13.16b //GHASH block 8k+5
6241
ldr q21, [x3, #144] //load h6k | h5k
6242
ldr q24, [x3, #192] //load h8k | h7k
6243
6244
rev32 v7.16b, v30.16b //CTR block 8k+15
6245
6246
aese v6.16b, v26.16b
6247
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
6248
aese v4.16b, v26.16b
6249
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
6250
aese v1.16b, v26.16b
6251
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
6252
6253
aese v5.16b, v26.16b
6254
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
6255
aese v0.16b, v26.16b
6256
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
6257
6258
aese v2.16b, v26.16b
6259
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
6260
aese v7.16b, v26.16b
6261
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
6262
aese v3.16b, v26.16b
6263
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
6264
6265
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
6266
rev64 v8.16b, v8.16b //GHASH block 8k
6267
aese v1.16b, v27.16b
6268
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
6269
6270
rev64 v9.16b, v9.16b //GHASH block 8k+1
6271
ldp q28, q26, [x8, #32] //load rk2, rk3
6272
aese v3.16b, v27.16b
6273
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
6274
6275
ldr q23, [x3, #176] //load h7l | h7h
6276
ext v23.16b, v23.16b, v23.16b, #8
6277
ldr q25, [x3, #208] //load h8l | h8h
6278
ext v25.16b, v25.16b, v25.16b, #8
6279
aese v2.16b, v27.16b
6280
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
6281
6282
ldr q20, [x3, #128] //load h5l | h5h
6283
ext v20.16b, v20.16b, v20.16b, #8
6284
ldr q22, [x3, #160] //load h6l | h6h
6285
ext v22.16b, v22.16b, v22.16b, #8
6286
aese v0.16b, v27.16b
6287
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
6288
aese v5.16b, v27.16b
6289
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
6290
6291
aese v4.16b, v27.16b
6292
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
6293
eor v8.16b, v8.16b, v19.16b //PRE 1
6294
6295
rev64 v11.16b, v11.16b //GHASH block 8k+3
6296
aese v6.16b, v27.16b
6297
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
6298
6299
aese v1.16b, v28.16b
6300
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
6301
aese v2.16b, v28.16b
6302
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
6303
aese v7.16b, v27.16b
6304
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
6305
6306
aese v4.16b, v28.16b
6307
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
6308
aese v0.16b, v28.16b
6309
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
6310
aese v6.16b, v28.16b
6311
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
6312
6313
aese v5.16b, v28.16b
6314
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
6315
aese v7.16b, v28.16b
6316
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
6317
aese v3.16b, v28.16b
6318
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
6319
6320
ldp q27, q28, [x8, #64] //load rk4, rk5
6321
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
6322
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
6323
6324
rev64 v14.16b, v14.16b //GHASH block 8k+6
6325
aese v4.16b, v26.16b
6326
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
6327
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
6328
6329
aese v7.16b, v26.16b
6330
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
6331
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
6332
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
6333
6334
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
6335
aese v6.16b, v26.16b
6336
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
6337
6338
aese v2.16b, v26.16b
6339
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
6340
aese v3.16b, v26.16b
6341
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
6342
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
6343
6344
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
6345
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
6346
aese v1.16b, v26.16b
6347
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
6348
6349
aese v0.16b, v26.16b
6350
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
6351
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
6352
aese v5.16b, v26.16b
6353
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
6354
6355
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
6356
aese v1.16b, v27.16b
6357
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
6358
aese v6.16b, v27.16b
6359
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
6360
6361
aese v0.16b, v27.16b
6362
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
6363
aese v2.16b, v27.16b
6364
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
6365
aese v4.16b, v27.16b
6366
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
6367
6368
aese v6.16b, v28.16b
6369
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
6370
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
6371
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
6372
6373
aese v7.16b, v27.16b
6374
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
6375
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
6376
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
6377
6378
aese v5.16b, v27.16b
6379
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
6380
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
6381
aese v3.16b, v27.16b
6382
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
6383
6384
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
6385
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
6386
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
6387
6388
rev64 v12.16b, v12.16b //GHASH block 8k+4
6389
aese v1.16b, v28.16b
6390
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
6391
aese v0.16b, v28.16b
6392
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
6393
6394
aese v7.16b, v28.16b
6395
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
6396
aese v4.16b, v28.16b
6397
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
6398
ldp q26, q27, [x8, #96] //load rk6, rk7
6399
6400
ldr q23, [x3, #80] //load h3l | h3h
6401
ext v23.16b, v23.16b, v23.16b, #8
6402
ldr q25, [x3, #112] //load h4l | h4h
6403
ext v25.16b, v25.16b, v25.16b, #8
6404
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
6405
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
6406
6407
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
6408
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
6409
6410
aese v5.16b, v28.16b
6411
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
6412
rev64 v15.16b, v15.16b //GHASH block 8k+7
6413
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
6414
6415
aese v3.16b, v28.16b
6416
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
6417
aese v2.16b, v28.16b
6418
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
6419
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
6420
6421
aese v7.16b, v26.16b
6422
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
6423
aese v4.16b, v26.16b
6424
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
6425
aese v6.16b, v26.16b
6426
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
6427
6428
ldr q21, [x3, #48] //load h2k | h1k
6429
ldr q24, [x3, #96] //load h4k | h3k
6430
aese v5.16b, v26.16b
6431
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
6432
aese v3.16b, v26.16b
6433
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
6434
6435
aese v0.16b, v26.16b
6436
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
6437
aese v1.16b, v26.16b
6438
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
6439
aese v2.16b, v26.16b
6440
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
6441
6442
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
6443
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
6444
ldr q20, [x3, #32] //load h1l | h1h
6445
ext v20.16b, v20.16b, v20.16b, #8
6446
ldr q22, [x3, #64] //load h2l | h2h
6447
ext v22.16b, v22.16b, v22.16b, #8
6448
6449
ldp q28, q26, [x8, #128] //load rk8, rk9
6450
aese v1.16b, v27.16b
6451
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
6452
aese v4.16b, v27.16b
6453
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
6454
6455
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
6456
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
6457
6458
aese v5.16b, v27.16b
6459
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
6460
aese v6.16b, v27.16b
6461
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
6462
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
6463
6464
aese v7.16b, v27.16b
6465
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
6466
aese v3.16b, v27.16b
6467
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
6468
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
6469
6470
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
6471
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
6472
aese v2.16b, v27.16b
6473
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
6474
6475
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
6476
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
6477
aese v0.16b, v27.16b
6478
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
6479
6480
aese v7.16b, v28.16b
6481
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
6482
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
6483
aese v2.16b, v28.16b
6484
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
6485
6486
aese v6.16b, v28.16b
6487
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
6488
aese v4.16b, v28.16b
6489
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
6490
aese v3.16b, v28.16b
6491
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
6492
6493
aese v5.16b, v28.16b
6494
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
6495
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
6496
aese v0.16b, v28.16b
6497
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
6498
6499
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
6500
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
6501
aese v1.16b, v28.16b
6502
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
6503
6504
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
6505
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
6506
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
6507
6508
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
6509
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
6510
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
6511
6512
ldp q27, q28, [x8, #160] //load rk10, rk11
6513
aese v1.16b, v26.16b
6514
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
6515
aese v0.16b, v26.16b
6516
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
6517
6518
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
6519
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
6520
ldr d16, [x10] //MODULO - load modulo constant
6521
6522
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
6523
6524
aese v3.16b, v26.16b
6525
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
6526
aese v7.16b, v26.16b
6527
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
6528
aese v5.16b, v26.16b
6529
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
6530
6531
aese v2.16b, v26.16b
6532
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
6533
aese v6.16b, v26.16b
6534
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
6535
6536
aese v5.16b, v27.16b
6537
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
6538
aese v1.16b, v27.16b
6539
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
6540
aese v4.16b, v26.16b
6541
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
6542
6543
aese v7.16b, v27.16b
6544
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
6545
aese v6.16b, v27.16b
6546
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
6547
aese v3.16b, v27.16b
6548
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
6549
6550
aese v4.16b, v27.16b
6551
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
6552
aese v0.16b, v27.16b
6553
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
6554
aese v2.16b, v27.16b
6555
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
6556
6557
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
6558
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
6559
aese v7.16b, v28.16b
6560
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
6561
6562
ldp q26, q27, [x8, #192] //load rk12, rk13
6563
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
6564
aese v2.16b, v28.16b
6565
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
6566
6567
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
6568
aese v1.16b, v28.16b
6569
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
6570
aese v6.16b, v28.16b
6571
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
6572
6573
aese v0.16b, v28.16b
6574
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
6575
aese v4.16b, v28.16b
6576
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
6577
aese v5.16b, v28.16b
6578
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
6579
6580
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
6581
aese v3.16b, v28.16b
6582
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
6583
ldr q28, [x8, #224] //load rk14
6584
6585
aese v1.16b, v26.16b
6586
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
6587
aese v2.16b, v26.16b
6588
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
6589
aese v0.16b, v26.16b
6590
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
6591
6592
aese v6.16b, v26.16b
6593
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
6594
aese v5.16b, v26.16b
6595
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
6596
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
6597
6598
aese v4.16b, v26.16b
6599
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
6600
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
6601
6602
aese v3.16b, v26.16b
6603
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
6604
aese v7.16b, v26.16b
6605
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
6606
aese v0.16b, v27.16b //AES block 8k+8 - round 13
6607
6608
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
6609
aese v5.16b, v27.16b //AES block 8k+13 - round 13
6610
aese v1.16b, v27.16b //AES block 8k+9 - round 13
6611
6612
aese v3.16b, v27.16b //AES block 8k+11 - round 13
6613
aese v4.16b, v27.16b //AES block 8k+12 - round 13
6614
aese v7.16b, v27.16b //AES block 8k+15 - round 13
6615
6616
aese v2.16b, v27.16b //AES block 8k+10 - round 13
6617
aese v6.16b, v27.16b //AES block 8k+14 - round 13
6618
.L256_enc_tail: //TAIL
6619
6620
ldp q24, q25, [x3, #192] //load h8l | h8h
6621
ext v25.16b, v25.16b, v25.16b, #8
6622
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
6623
6624
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
6625
6626
ldp q20, q21, [x3, #128] //load h5l | h5h
6627
ext v20.16b, v20.16b, v20.16b, #8
6628
6629
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
6630
ldp q22, q23, [x3, #160] //load h6l | h6h
6631
ext v22.16b, v22.16b, v22.16b, #8
6632
ext v23.16b, v23.16b, v23.16b, #8
6633
mov v29.16b, v28.16b
6634
6635
cmp x5, #112
6636
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
6637
b.gt .L256_enc_blocks_more_than_7
6638
6639
movi v19.8b, #0
6640
mov v7.16b, v6.16b
6641
movi v17.8b, #0
6642
6643
mov v6.16b, v5.16b
6644
mov v5.16b, v4.16b
6645
mov v4.16b, v3.16b
6646
6647
mov v3.16b, v2.16b
6648
sub v30.4s, v30.4s, v31.4s
6649
mov v2.16b, v1.16b
6650
6651
movi v18.8b, #0
6652
cmp x5, #96
6653
b.gt .L256_enc_blocks_more_than_6
6654
6655
mov v7.16b, v6.16b
6656
mov v6.16b, v5.16b
6657
cmp x5, #80
6658
6659
mov v5.16b, v4.16b
6660
mov v4.16b, v3.16b
6661
mov v3.16b, v1.16b
6662
6663
sub v30.4s, v30.4s, v31.4s
6664
b.gt .L256_enc_blocks_more_than_5
6665
6666
mov v7.16b, v6.16b
6667
sub v30.4s, v30.4s, v31.4s
6668
6669
mov v6.16b, v5.16b
6670
mov v5.16b, v4.16b
6671
6672
cmp x5, #64
6673
mov v4.16b, v1.16b
6674
b.gt .L256_enc_blocks_more_than_4
6675
6676
cmp x5, #48
6677
mov v7.16b, v6.16b
6678
mov v6.16b, v5.16b
6679
6680
mov v5.16b, v1.16b
6681
sub v30.4s, v30.4s, v31.4s
6682
b.gt .L256_enc_blocks_more_than_3
6683
6684
cmp x5, #32
6685
mov v7.16b, v6.16b
6686
ldr q24, [x3, #96] //load h4k | h3k
6687
6688
mov v6.16b, v1.16b
6689
sub v30.4s, v30.4s, v31.4s
6690
b.gt .L256_enc_blocks_more_than_2
6691
6692
mov v7.16b, v1.16b
6693
6694
sub v30.4s, v30.4s, v31.4s
6695
cmp x5, #16
6696
b.gt .L256_enc_blocks_more_than_1
6697
6698
sub v30.4s, v30.4s, v31.4s
6699
ldr q21, [x3, #48] //load h2k | h1k
6700
b .L256_enc_blocks_less_than_1
6701
.L256_enc_blocks_more_than_7: //blocks left > 7
6702
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
6703
6704
rev64 v8.16b, v9.16b //GHASH final-7 block
6705
6706
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6707
6708
ldr q9, [x0], #16 //AES final-6 block - load plaintext
6709
6710
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
6711
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
6712
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
6713
6714
movi v16.8b, #0 //suppress further partial tag feed in
6715
6716
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
6717
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
6718
6719
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
6720
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
6721
.L256_enc_blocks_more_than_6: //blocks left > 6
6722
6723
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
6724
6725
rev64 v8.16b, v9.16b //GHASH final-6 block
6726
6727
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6728
6729
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
6730
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
6731
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
6732
6733
ldr q9, [x0], #16 //AES final-5 block - load plaintext
6734
6735
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
6736
6737
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
6738
6739
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
6740
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
6741
6742
movi v16.8b, #0 //suppress further partial tag feed in
6743
6744
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
6745
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
6746
.L256_enc_blocks_more_than_5: //blocks left > 5
6747
6748
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
6749
6750
rev64 v8.16b, v9.16b //GHASH final-5 block
6751
6752
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6753
6754
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
6755
6756
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
6757
6758
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
6759
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
6760
6761
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
6762
6763
ldr q9, [x0], #16 //AES final-4 block - load plaintext
6764
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
6765
6766
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
6767
movi v16.8b, #0 //suppress further partial tag feed in
6768
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
6769
6770
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
6771
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
6772
.L256_enc_blocks_more_than_4: //blocks left > 4
6773
6774
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
6775
6776
rev64 v8.16b, v9.16b //GHASH final-4 block
6777
6778
ldr q9, [x0], #16 //AES final-3 block - load plaintext
6779
6780
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6781
6782
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
6783
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
6784
6785
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
6786
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
6787
6788
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
6789
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
6790
6791
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
6792
6793
movi v16.8b, #0 //suppress further partial tag feed in
6794
6795
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
6796
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
6797
.L256_enc_blocks_more_than_3: //blocks left > 3
6798
6799
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
6800
6801
ldr q25, [x3, #112] //load h4l | h4h
6802
ext v25.16b, v25.16b, v25.16b, #8
6803
rev64 v8.16b, v9.16b //GHASH final-3 block
6804
6805
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6806
6807
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
6808
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
6809
6810
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
6811
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
6812
ldr q24, [x3, #96] //load h4k | h3k
6813
6814
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
6815
ldr q9, [x0], #16 //AES final-2 block - load plaintext
6816
6817
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
6818
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
6819
6820
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
6821
movi v16.8b, #0 //suppress further partial tag feed in
6822
6823
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
6824
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
6825
.L256_enc_blocks_more_than_2: //blocks left > 2
6826
6827
ldr q23, [x3, #80] //load h3l | h3h
6828
ext v23.16b, v23.16b, v23.16b, #8
6829
6830
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
6831
6832
rev64 v8.16b, v9.16b //GHASH final-2 block
6833
ldr q9, [x0], #16 //AES final-1 block - load plaintext
6834
6835
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6836
6837
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
6838
6839
movi v16.8b, #0 //suppress further partial tag feed in
6840
6841
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
6842
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
6843
6844
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
6845
6846
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
6847
6848
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
6849
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
6850
6851
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
6852
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
6853
.L256_enc_blocks_more_than_1: //blocks left > 1
6854
6855
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
6856
6857
ldr q22, [x3, #64] //load h2l | h2h
6858
ext v22.16b, v22.16b, v22.16b, #8
6859
rev64 v8.16b, v9.16b //GHASH final-1 block
6860
ldr q9, [x0], #16 //AES final block - load plaintext
6861
6862
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6863
movi v16.8b, #0 //suppress further partial tag feed in
6864
6865
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
6866
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
6867
6868
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
6869
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
6870
6871
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
6872
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
6873
6874
ldr q21, [x3, #48] //load h2k | h1k
6875
6876
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
6877
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
6878
6879
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
6880
6881
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
6882
.L256_enc_blocks_less_than_1: //blocks left <= 1
6883
6884
and x1, x1, #127 //bit_length %= 128
6885
6886
sub x1, x1, #128 //bit_length -= 128
6887
6888
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
6889
6890
mvn x6, xzr //temp0_x = 0xffffffffffffffff
6891
and x1, x1, #127 //bit_length %= 128
6892
6893
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block
6894
cmp x1, #64
6895
mvn x7, xzr //temp1_x = 0xffffffffffffffff
6896
6897
csel x14, x6, xzr, lt
6898
csel x13, x7, x6, lt
6899
6900
mov v0.d[0], x13 //ctr0b is mask for last block
6901
ldr q20, [x3, #32] //load h1l | h1h
6902
ext v20.16b, v20.16b, v20.16b, #8
6903
6904
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
6905
mov v0.d[1], x14
6906
6907
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
6908
6909
rev64 v8.16b, v9.16b //GHASH final block
6910
6911
rev32 v30.16b, v30.16b
6912
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
6913
str q30, [x16] //store the updated counter
6914
6915
eor v8.16b, v8.16b, v16.16b //feed in partial tag
6916
st1 { v9.16b}, [x2] //store all 16B
6917
6918
ins v16.d[0], v8.d[1] //GHASH final block - mid
6919
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
6920
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
6921
6922
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
6923
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
6924
6925
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
6926
6927
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
6928
6929
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
6930
ldr d16, [x10] //MODULO - load modulo constant
6931
6932
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
6933
6934
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
6935
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
6936
6937
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
6938
6939
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
6940
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
6941
6942
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
6943
ext v19.16b, v19.16b, v19.16b, #8
6944
rev64 v19.16b, v19.16b
6945
st1 { v19.16b }, [x3]
6946
mov x0, x9 //return sizes
6947
6948
ldp d10, d11, [sp, #16]
6949
ldp d12, d13, [sp, #32]
6950
ldp d14, d15, [sp, #48]
6951
ldp d8, d9, [sp], #80
6952
ret
6953
6954
.L256_enc_ret: //bare exit: returns 0; unlike the epilogue above, no d8-d15 reload and no sp adjustment happen here
6955
mov w0, #0x0 //return value 0 (NOTE(review): presumably reached from a zero-length check at kernel entry, which is outside this chunk - confirm)
6956
ret
6957
.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6958
.globl unroll8_eor3_aes_gcm_dec_256_kernel
6959
.type unroll8_eor3_aes_gcm_dec_256_kernel,%function
6960
.align 4
6961
unroll8_eor3_aes_gcm_dec_256_kernel:
6962
AARCH64_VALID_CALL_TARGET
6963
cbz x1, .L256_dec_ret
6964
stp d8, d9, [sp, #-80]!
6965
lsr x9, x1, #3
6966
mov x16, x4
6967
mov x8, x5
6968
stp d10, d11, [sp, #16]
6969
stp d12, d13, [sp, #32]
6970
stp d14, d15, [sp, #48]
6971
mov x5, #0xc200000000000000
6972
stp x5, xzr, [sp, #64]
6973
add x10, sp, #64
6974
6975
ld1 { v0.16b}, [x16] //CTR block 0
6976
6977
mov x15, #0x100000000 //set up counter increment
6978
movi v31.16b, #0x0
6979
mov v31.d[1], x15
6980
mov x5, x9
6981
6982
sub x5, x5, #1 //byte_len - 1
6983
6984
rev32 v30.16b, v0.16b //set up reversed counter
6985
6986
add v30.4s, v30.4s, v31.4s //CTR block 0
6987
6988
rev32 v1.16b, v30.16b //CTR block 1
6989
add v30.4s, v30.4s, v31.4s //CTR block 1
6990
6991
rev32 v2.16b, v30.16b //CTR block 2
6992
add v30.4s, v30.4s, v31.4s //CTR block 2
6993
ldp q26, q27, [x8, #0] //load rk0, rk1
6994
6995
rev32 v3.16b, v30.16b //CTR block 3
6996
add v30.4s, v30.4s, v31.4s //CTR block 3
6997
6998
rev32 v4.16b, v30.16b //CTR block 4
6999
add v30.4s, v30.4s, v31.4s //CTR block 4
7000
7001
aese v0.16b, v26.16b
7002
aesmc v0.16b, v0.16b //AES block 0 - round 0
7003
7004
rev32 v5.16b, v30.16b //CTR block 5
7005
add v30.4s, v30.4s, v31.4s //CTR block 5
7006
7007
aese v1.16b, v26.16b
7008
aesmc v1.16b, v1.16b //AES block 1 - round 0
7009
aese v2.16b, v26.16b
7010
aesmc v2.16b, v2.16b //AES block 2 - round 0
7011
7012
rev32 v6.16b, v30.16b //CTR block 6
7013
add v30.4s, v30.4s, v31.4s //CTR block 6
7014
7015
rev32 v7.16b, v30.16b //CTR block 7
7016
aese v4.16b, v26.16b
7017
aesmc v4.16b, v4.16b //AES block 4 - round 0
7018
7019
aese v6.16b, v26.16b
7020
aesmc v6.16b, v6.16b //AES block 6 - round 0
7021
aese v5.16b, v26.16b
7022
aesmc v5.16b, v5.16b //AES block 5 - round 0
7023
7024
aese v3.16b, v26.16b
7025
aesmc v3.16b, v3.16b //AES block 3 - round 0
7026
aese v7.16b, v26.16b
7027
aesmc v7.16b, v7.16b //AES block 7 - round 0
7028
ldp q28, q26, [x8, #32] //load rk2, rk3
7029
7030
aese v6.16b, v27.16b
7031
aesmc v6.16b, v6.16b //AES block 6 - round 1
7032
aese v4.16b, v27.16b
7033
aesmc v4.16b, v4.16b //AES block 4 - round 1
7034
aese v0.16b, v27.16b
7035
aesmc v0.16b, v0.16b //AES block 0 - round 1
7036
7037
aese v5.16b, v27.16b
7038
aesmc v5.16b, v5.16b //AES block 5 - round 1
7039
aese v7.16b, v27.16b
7040
aesmc v7.16b, v7.16b //AES block 7 - round 1
7041
aese v1.16b, v27.16b
7042
aesmc v1.16b, v1.16b //AES block 1 - round 1
7043
7044
aese v2.16b, v27.16b
7045
aesmc v2.16b, v2.16b //AES block 2 - round 1
7046
aese v3.16b, v27.16b
7047
aesmc v3.16b, v3.16b //AES block 3 - round 1
7048
7049
aese v3.16b, v28.16b
7050
aesmc v3.16b, v3.16b //AES block 3 - round 2
7051
aese v2.16b, v28.16b
7052
aesmc v2.16b, v2.16b //AES block 2 - round 2
7053
aese v6.16b, v28.16b
7054
aesmc v6.16b, v6.16b //AES block 6 - round 2
7055
7056
aese v1.16b, v28.16b
7057
aesmc v1.16b, v1.16b //AES block 1 - round 2
7058
aese v7.16b, v28.16b
7059
aesmc v7.16b, v7.16b //AES block 7 - round 2
7060
aese v5.16b, v28.16b
7061
aesmc v5.16b, v5.16b //AES block 5 - round 2
7062
7063
aese v0.16b, v28.16b
7064
aesmc v0.16b, v0.16b //AES block 0 - round 2
7065
aese v4.16b, v28.16b
7066
aesmc v4.16b, v4.16b //AES block 4 - round 2
7067
ldp q27, q28, [x8, #64] //load rk4, rk5
7068
7069
aese v1.16b, v26.16b
7070
aesmc v1.16b, v1.16b //AES block 1 - round 3
7071
aese v2.16b, v26.16b
7072
aesmc v2.16b, v2.16b //AES block 2 - round 3
7073
7074
aese v3.16b, v26.16b
7075
aesmc v3.16b, v3.16b //AES block 3 - round 3
7076
aese v4.16b, v26.16b
7077
aesmc v4.16b, v4.16b //AES block 4 - round 3
7078
7079
aese v5.16b, v26.16b
7080
aesmc v5.16b, v5.16b //AES block 5 - round 3
7081
aese v7.16b, v26.16b
7082
aesmc v7.16b, v7.16b //AES block 7 - round 3
7083
aese v0.16b, v26.16b
7084
aesmc v0.16b, v0.16b //AES block 0 - round 3
7085
7086
aese v6.16b, v26.16b
7087
aesmc v6.16b, v6.16b //AES block 6 - round 3
7088
7089
aese v7.16b, v27.16b
7090
aesmc v7.16b, v7.16b //AES block 7 - round 4
7091
aese v3.16b, v27.16b
7092
aesmc v3.16b, v3.16b //AES block 3 - round 4
7093
7094
aese v6.16b, v27.16b
7095
aesmc v6.16b, v6.16b //AES block 6 - round 4
7096
aese v2.16b, v27.16b
7097
aesmc v2.16b, v2.16b //AES block 2 - round 4
7098
aese v0.16b, v27.16b
7099
aesmc v0.16b, v0.16b //AES block 0 - round 4
7100
7101
aese v4.16b, v27.16b
7102
aesmc v4.16b, v4.16b //AES block 4 - round 4
7103
aese v1.16b, v27.16b
7104
aesmc v1.16b, v1.16b //AES block 1 - round 4
7105
aese v5.16b, v27.16b
7106
aesmc v5.16b, v5.16b //AES block 5 - round 4
7107
7108
aese v0.16b, v28.16b
7109
aesmc v0.16b, v0.16b //AES block 0 - round 5
7110
aese v6.16b, v28.16b
7111
aesmc v6.16b, v6.16b //AES block 6 - round 5
7112
7113
ldp q26, q27, [x8, #96] //load rk6, rk7
7114
aese v4.16b, v28.16b
7115
aesmc v4.16b, v4.16b //AES block 4 - round 5
7116
aese v7.16b, v28.16b
7117
aesmc v7.16b, v7.16b //AES block 7 - round 5
7118
7119
aese v5.16b, v28.16b
7120
aesmc v5.16b, v5.16b //AES block 5 - round 5
7121
7122
aese v2.16b, v28.16b
7123
aesmc v2.16b, v2.16b //AES block 2 - round 5
7124
aese v3.16b, v28.16b
7125
aesmc v3.16b, v3.16b //AES block 3 - round 5
7126
7127
aese v1.16b, v28.16b
7128
aesmc v1.16b, v1.16b //AES block 1 - round 5
7129
7130
aese v4.16b, v26.16b
7131
aesmc v4.16b, v4.16b //AES block 4 - round 6
7132
aese v3.16b, v26.16b
7133
aesmc v3.16b, v3.16b //AES block 3 - round 6
7134
aese v7.16b, v26.16b
7135
aesmc v7.16b, v7.16b //AES block 7 - round 6
7136
7137
aese v6.16b, v26.16b
7138
aesmc v6.16b, v6.16b //AES block 6 - round 6
7139
aese v0.16b, v26.16b
7140
aesmc v0.16b, v0.16b //AES block 0 - round 6
7141
aese v5.16b, v26.16b
7142
aesmc v5.16b, v5.16b //AES block 5 - round 6
7143
7144
aese v2.16b, v26.16b
7145
aesmc v2.16b, v2.16b //AES block 2 - round 6
7146
aese v1.16b, v26.16b
7147
aesmc v1.16b, v1.16b //AES block 1 - round 6
7148
ldp q28, q26, [x8, #128] //load rk8, rk9
7149
7150
aese v5.16b, v27.16b
7151
aesmc v5.16b, v5.16b //AES block 5 - round 7
7152
aese v0.16b, v27.16b
7153
aesmc v0.16b, v0.16b //AES block 0 - round 7
7154
7155
aese v3.16b, v27.16b
7156
aesmc v3.16b, v3.16b //AES block 3 - round 7
7157
aese v2.16b, v27.16b
7158
aesmc v2.16b, v2.16b //AES block 2 - round 7
7159
aese v7.16b, v27.16b
7160
aesmc v7.16b, v7.16b //AES block 7 - round 7
7161
7162
aese v4.16b, v27.16b
7163
aesmc v4.16b, v4.16b //AES block 4 - round 7
7164
aese v1.16b, v27.16b
7165
aesmc v1.16b, v1.16b //AES block 1 - round 7
7166
aese v6.16b, v27.16b
7167
aesmc v6.16b, v6.16b //AES block 6 - round 7
7168
7169
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
7170
aese v7.16b, v28.16b
7171
aesmc v7.16b, v7.16b //AES block 7 - round 8
7172
aese v5.16b, v28.16b
7173
aesmc v5.16b, v5.16b //AES block 5 - round 8
7174
7175
aese v0.16b, v28.16b
7176
aesmc v0.16b, v0.16b //AES block 0 - round 8
7177
aese v1.16b, v28.16b
7178
aesmc v1.16b, v1.16b //AES block 1 - round 8
7179
aese v2.16b, v28.16b
7180
aesmc v2.16b, v2.16b //AES block 2 - round 8
7181
7182
aese v4.16b, v28.16b
7183
aesmc v4.16b, v4.16b //AES block 4 - round 8
7184
aese v3.16b, v28.16b
7185
aesmc v3.16b, v3.16b //AES block 3 - round 8
7186
aese v6.16b, v28.16b
7187
aesmc v6.16b, v6.16b //AES block 6 - round 8
7188
7189
aese v2.16b, v26.16b
7190
aesmc v2.16b, v2.16b //AES block 2 - round 9
7191
7192
ld1 { v19.16b}, [x3]
7193
ext v19.16b, v19.16b, v19.16b, #8
7194
rev64 v19.16b, v19.16b
7195
ldp q27, q28, [x8, #160] //load rk10, rk11
7196
add x4, x0, x1, lsr #3 //end_input_ptr
7197
add x5, x5, x0
7198
7199
aese v3.16b, v26.16b
7200
aesmc v3.16b, v3.16b //AES block 3 - round 9
7201
aese v6.16b, v26.16b
7202
aesmc v6.16b, v6.16b //AES block 6 - round 9
7203
7204
aese v4.16b, v26.16b
7205
aesmc v4.16b, v4.16b //AES block 4 - round 9
7206
aese v5.16b, v26.16b
7207
aesmc v5.16b, v5.16b //AES block 5 - round 9
7208
7209
aese v7.16b, v26.16b
7210
aesmc v7.16b, v7.16b //AES block 7 - round 9
7211
7212
aese v0.16b, v26.16b
7213
aesmc v0.16b, v0.16b //AES block 0 - round 9
7214
aese v1.16b, v26.16b
7215
aesmc v1.16b, v1.16b //AES block 1 - round 9
7216
7217
aese v4.16b, v27.16b
7218
aesmc v4.16b, v4.16b //AES block 4 - round 10
7219
aese v7.16b, v27.16b
7220
aesmc v7.16b, v7.16b //AES block 7 - round 10
7221
aese v5.16b, v27.16b
7222
aesmc v5.16b, v5.16b //AES block 5 - round 10
7223
7224
aese v1.16b, v27.16b
7225
aesmc v1.16b, v1.16b //AES block 1 - round 10
7226
aese v2.16b, v27.16b
7227
aesmc v2.16b, v2.16b //AES block 2 - round 10
7228
aese v0.16b, v27.16b
7229
aesmc v0.16b, v0.16b //AES block 0 - round 10
7230
7231
aese v6.16b, v27.16b
7232
aesmc v6.16b, v6.16b //AES block 6 - round 10
7233
aese v3.16b, v27.16b
7234
aesmc v3.16b, v3.16b //AES block 3 - round 10
7235
ldp q26, q27, [x8, #192] //load rk12, rk13
7236
7237
aese v0.16b, v28.16b
7238
aesmc v0.16b, v0.16b //AES block 0 - round 11
7239
add v30.4s, v30.4s, v31.4s //CTR block 7
7240
7241
aese v7.16b, v28.16b
7242
aesmc v7.16b, v7.16b //AES block 7 - round 11
7243
aese v3.16b, v28.16b
7244
aesmc v3.16b, v3.16b //AES block 3 - round 11
7245
aese v1.16b, v28.16b
7246
aesmc v1.16b, v1.16b //AES block 1 - round 11
7247
7248
aese v5.16b, v28.16b
7249
aesmc v5.16b, v5.16b //AES block 5 - round 11
7250
aese v4.16b, v28.16b
7251
aesmc v4.16b, v4.16b //AES block 4 - round 11
7252
aese v2.16b, v28.16b
7253
aesmc v2.16b, v2.16b //AES block 2 - round 11
7254
7255
aese v6.16b, v28.16b
7256
aesmc v6.16b, v6.16b //AES block 6 - round 11
7257
ldr q28, [x8, #224] //load rk14
7258
7259
aese v1.16b, v26.16b
7260
aesmc v1.16b, v1.16b //AES block 1 - round 12
7261
aese v4.16b, v26.16b
7262
aesmc v4.16b, v4.16b //AES block 4 - round 12
7263
aese v5.16b, v26.16b
7264
aesmc v5.16b, v5.16b //AES block 5 - round 12
7265
7266
cmp x0, x5 //check if we have <= 8 blocks
7267
aese v3.16b, v26.16b
7268
aesmc v3.16b, v3.16b //AES block 3 - round 12
7269
aese v2.16b, v26.16b
7270
aesmc v2.16b, v2.16b //AES block 2 - round 12
7271
7272
aese v6.16b, v26.16b
7273
aesmc v6.16b, v6.16b //AES block 6 - round 12
7274
aese v0.16b, v26.16b
7275
aesmc v0.16b, v0.16b //AES block 0 - round 12
7276
aese v7.16b, v26.16b
7277
aesmc v7.16b, v7.16b //AES block 7 - round 12
7278
7279
aese v5.16b, v27.16b //AES block 5 - round 13
7280
aese v1.16b, v27.16b //AES block 1 - round 13
7281
aese v2.16b, v27.16b //AES block 2 - round 13
7282
7283
aese v0.16b, v27.16b //AES block 0 - round 13
7284
aese v4.16b, v27.16b //AES block 4 - round 13
7285
aese v6.16b, v27.16b //AES block 6 - round 13
7286
7287
aese v3.16b, v27.16b //AES block 3 - round 13
7288
aese v7.16b, v27.16b //AES block 7 - round 13
7289
b.ge .L256_dec_tail //handle tail
7290
7291
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
7292
7293
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
7294
7295
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
7296
7297
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
7298
cmp x0, x5 //check if we have <= 8 blocks
7299
7300
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
7301
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
7302
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
7303
7304
rev32 v0.16b, v30.16b //CTR block 8
7305
add v30.4s, v30.4s, v31.4s //CTR block 8
7306
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
7307
7308
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
7309
7310
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
7311
rev32 v1.16b, v30.16b //CTR block 9
7312
add v30.4s, v30.4s, v31.4s //CTR block 9
7313
7314
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
7315
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
7316
7317
rev32 v2.16b, v30.16b //CTR block 10
7318
add v30.4s, v30.4s, v31.4s //CTR block 10
7319
7320
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
7321
7322
rev32 v3.16b, v30.16b //CTR block 11
7323
add v30.4s, v30.4s, v31.4s //CTR block 11
7324
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
7325
7326
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
7327
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
7328
7329
rev32 v4.16b, v30.16b //CTR block 12
7330
add v30.4s, v30.4s, v31.4s //CTR block 12
7331
b.ge .L256_dec_prepretail //do prepretail
7332
7333
.L256_dec_main_loop: //main loop start
7334
rev32 v5.16b, v30.16b //CTR block 8k+13
7335
ldp q26, q27, [x8, #0] //load rk0, rk1
7336
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
7337
7338
rev64 v9.16b, v9.16b //GHASH block 8k+1
7339
ldr q23, [x3, #176] //load h7l | h7h
7340
ext v23.16b, v23.16b, v23.16b, #8
7341
ldr q25, [x3, #208] //load h8l | h8h
7342
ext v25.16b, v25.16b, v25.16b, #8
7343
7344
rev32 v6.16b, v30.16b //CTR block 8k+14
7345
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
7346
rev64 v8.16b, v8.16b //GHASH block 8k
7347
7348
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
7349
rev64 v12.16b, v12.16b //GHASH block 8k+4
7350
rev64 v11.16b, v11.16b //GHASH block 8k+3
7351
7352
rev32 v7.16b, v30.16b //CTR block 8k+15
7353
rev64 v15.16b, v15.16b //GHASH block 8k+7
7354
7355
aese v3.16b, v26.16b
7356
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
7357
aese v6.16b, v26.16b
7358
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
7359
aese v2.16b, v26.16b
7360
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
7361
7362
aese v7.16b, v26.16b
7363
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
7364
aese v0.16b, v26.16b
7365
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
7366
aese v5.16b, v26.16b
7367
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
7368
7369
aese v4.16b, v26.16b
7370
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
7371
aese v1.16b, v26.16b
7372
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
7373
ldp q28, q26, [x8, #32] //load rk2, rk3
7374
7375
eor v8.16b, v8.16b, v19.16b //PRE 1
7376
ldr q20, [x3, #128] //load h5l | h5h
7377
ext v20.16b, v20.16b, v20.16b, #8
7378
ldr q22, [x3, #160] //load h6l | h6h
7379
ext v22.16b, v22.16b, v22.16b, #8
7380
aese v6.16b, v27.16b
7381
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
7382
7383
aese v4.16b, v27.16b
7384
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
7385
rev64 v10.16b, v10.16b //GHASH block 8k+2
7386
aese v3.16b, v27.16b
7387
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
7388
7389
aese v0.16b, v27.16b
7390
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
7391
aese v5.16b, v27.16b
7392
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
7393
aese v2.16b, v27.16b
7394
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
7395
7396
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
7397
aese v7.16b, v27.16b
7398
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
7399
aese v1.16b, v27.16b
7400
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
7401
7402
aese v4.16b, v28.16b
7403
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
7404
aese v0.16b, v28.16b
7405
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
7406
aese v3.16b, v28.16b
7407
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
7408
7409
aese v6.16b, v28.16b
7410
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
7411
aese v7.16b, v28.16b
7412
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
7413
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
7414
7415
aese v5.16b, v28.16b
7416
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
7417
aese v2.16b, v28.16b
7418
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
7419
aese v1.16b, v28.16b
7420
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
7421
7422
ldp q27, q28, [x8, #64] //load rk4, rk5
7423
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
7424
aese v3.16b, v26.16b
7425
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
7426
7427
aese v0.16b, v26.16b
7428
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
7429
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
7430
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
7431
7432
aese v5.16b, v26.16b
7433
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
7434
aese v6.16b, v26.16b
7435
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
7436
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
7437
7438
aese v4.16b, v26.16b
7439
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
7440
aese v1.16b, v26.16b
7441
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
7442
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
7443
7444
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
7445
aese v2.16b, v26.16b
7446
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
7447
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
7448
7449
aese v5.16b, v27.16b
7450
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
7451
aese v7.16b, v26.16b
7452
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
7453
aese v3.16b, v27.16b
7454
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
7455
7456
aese v2.16b, v27.16b
7457
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
7458
aese v0.16b, v27.16b
7459
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
7460
aese v1.16b, v27.16b
7461
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
7462
7463
aese v6.16b, v27.16b
7464
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
7465
aese v7.16b, v27.16b
7466
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
7467
aese v4.16b, v27.16b
7468
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
7469
7470
ldr q21, [x3, #144] //load h6k | h5k
7471
ldr q24, [x3, #192] //load h8k | h7k
7472
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
7473
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
7474
7475
ldp q26, q27, [x8, #96] //load rk6, rk7
7476
aese v5.16b, v28.16b
7477
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
7478
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
7479
7480
aese v0.16b, v28.16b
7481
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
7482
aese v3.16b, v28.16b
7483
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
7484
aese v7.16b, v28.16b
7485
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
7486
7487
aese v1.16b, v28.16b
7488
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
7489
aese v2.16b, v28.16b
7490
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
7491
aese v6.16b, v28.16b
7492
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
7493
7494
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
7495
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
7496
rev64 v13.16b, v13.16b //GHASH block 8k+5
7497
7498
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
7499
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
7500
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
7501
7502
aese v3.16b, v26.16b
7503
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
7504
aese v0.16b, v26.16b
7505
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
7506
aese v4.16b, v28.16b
7507
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
7508
7509
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
7510
aese v1.16b, v26.16b
7511
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
7512
aese v6.16b, v26.16b
7513
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
7514
7515
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
7516
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
7517
aese v4.16b, v26.16b
7518
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
7519
7520
aese v2.16b, v26.16b
7521
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
7522
aese v5.16b, v26.16b
7523
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
7524
aese v7.16b, v26.16b
7525
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
7526
7527
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
7528
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
7529
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
7530
7531
ldr q23, [x3, #80] //load h3l | h3h
7532
ext v23.16b, v23.16b, v23.16b, #8
7533
ldr q25, [x3, #112] //load h4l | h4h
7534
ext v25.16b, v25.16b, v25.16b, #8
7535
rev64 v14.16b, v14.16b //GHASH block 8k+6
7536
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
7537
7538
aese v2.16b, v27.16b
7539
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
7540
aese v5.16b, v27.16b
7541
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
7542
ldp q28, q26, [x8, #128] //load rk8, rk9
7543
7544
ldr q20, [x3, #32] //load h1l | h1h
7545
ext v20.16b, v20.16b, v20.16b, #8
7546
ldr q22, [x3, #64] //load h2l | h2h
7547
ext v22.16b, v22.16b, v22.16b, #8
7548
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
7549
aese v7.16b, v27.16b
7550
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
7551
7552
aese v1.16b, v27.16b
7553
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
7554
aese v3.16b, v27.16b
7555
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
7556
aese v6.16b, v27.16b
7557
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
7558
7559
ldr q21, [x3, #48] //load h2k | h1k
7560
ldr q24, [x3, #96] //load h4k | h3k
7561
aese v0.16b, v27.16b
7562
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
7563
aese v4.16b, v27.16b
7564
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
7565
7566
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
7567
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
7568
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
7569
7570
aese v5.16b, v28.16b
7571
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
7572
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
7573
aese v2.16b, v28.16b
7574
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
7575
7576
aese v6.16b, v28.16b
7577
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
7578
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
7579
aese v1.16b, v28.16b
7580
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
7581
7582
aese v4.16b, v28.16b
7583
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
7584
aese v0.16b, v28.16b
7585
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
7586
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
7587
7588
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
7589
aese v3.16b, v28.16b
7590
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
7591
aese v7.16b, v28.16b
7592
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
7593
7594
ldp q27, q28, [x8, #160] //load rk10, rk11
7595
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
7596
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
7597
7598
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
7599
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
7600
aese v3.16b, v26.16b
7601
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
7602
7603
aese v6.16b, v26.16b
7604
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
7605
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
7606
aese v5.16b, v26.16b
7607
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
7608
7609
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
7610
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
7611
aese v7.16b, v26.16b
7612
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
7613
7614
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
7615
aese v2.16b, v26.16b
7616
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
7617
aese v1.16b, v26.16b
7618
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
7619
7620
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
7621
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
7622
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
7623
7624
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
7625
aese v3.16b, v27.16b
7626
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
7627
aese v6.16b, v27.16b
7628
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
7629
7630
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
7631
aese v0.16b, v26.16b
7632
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
7633
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
7634
7635
aese v4.16b, v26.16b
7636
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
7637
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
7638
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
7639
7640
aese v2.16b, v27.16b
7641
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
7642
aese v5.16b, v27.16b
7643
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
7644
aese v7.16b, v27.16b
7645
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
7646
7647
aese v1.16b, v27.16b
7648
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
7649
aese v0.16b, v27.16b
7650
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
7651
aese v4.16b, v27.16b
7652
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
7653
7654
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
7655
rev32 v20.16b, v30.16b //CTR block 8k+16
7656
ldr d16, [x10] //MODULO - load modulo constant
7657
7658
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
7659
aese v1.16b, v28.16b
7660
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
7661
ldp q26, q27, [x8, #192] //load rk12, rk13
7662
7663
aese v0.16b, v28.16b
7664
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
7665
aese v6.16b, v28.16b
7666
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
7667
7668
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
7669
rev32 v22.16b, v30.16b //CTR block 8k+17
7670
aese v2.16b, v28.16b
7671
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
7672
7673
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
7674
aese v7.16b, v28.16b
7675
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
7676
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
7677
7678
aese v5.16b, v28.16b
7679
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
7680
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
7681
aese v3.16b, v28.16b
7682
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
7683
7684
aese v2.16b, v26.16b
7685
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
7686
aese v7.16b, v26.16b
7687
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
7688
aese v6.16b, v26.16b
7689
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
7690
7691
rev32 v23.16b, v30.16b //CTR block 8k+18
7692
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
7693
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
7694
7695
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
7696
aese v1.16b, v26.16b
7697
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
7698
aese v4.16b, v28.16b
7699
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
7700
7701
ldr q28, [x8, #224] //load rk14
7702
aese v5.16b, v26.16b
7703
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
7704
aese v3.16b, v26.16b
7705
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
7706
7707
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
7708
aese v0.16b, v26.16b
7709
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
7710
aese v4.16b, v26.16b
7711
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
7712
7713
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
7714
aese v1.16b, v27.16b //AES block 8k+9 - round 13
7715
aese v2.16b, v27.16b //AES block 8k+10 - round 13
7716
7717
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
7718
aese v0.16b, v27.16b //AES block 8k+8 - round 13
7719
aese v5.16b, v27.16b //AES block 8k+13 - round 13
7720
7721
rev32 v25.16b, v30.16b //CTR block 8k+19
7722
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
7723
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
7724
7725
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
7726
aese v7.16b, v27.16b //AES block 8k+15 - round 13
7727
7728
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
7729
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
7730
aese v4.16b, v27.16b //AES block 8k+12 - round 13
7731
7732
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result
7733
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
7734
aese v3.16b, v27.16b //AES block 8k+11 - round 13
7735
7736
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
7737
mov v0.16b, v20.16b //CTR block 8k+16
7738
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result
7739
7740
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
7741
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
7742
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
7743
7744
mov v3.16b, v25.16b //CTR block 8k+19
7745
mov v2.16b, v23.16b //CTR block 8k+18
7746
aese v6.16b, v27.16b //AES block 8k+14 - round 13
7747
7748
mov v1.16b, v22.16b //CTR block 8k+17
7749
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
7750
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result
7751
7752
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result
7753
rev32 v4.16b, v30.16b //CTR block 8k+20
7754
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
7755
7756
cmp x0, x5 //.LOOP CONTROL
7757
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
7758
b.lt .L256_dec_main_loop
7759
7760
.L256_dec_prepretail: //PREPRETAIL
7761
ldp q26, q27, [x8, #0] //load rk0, rk1
7762
rev32 v5.16b, v30.16b //CTR block 8k+13
7763
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
7764
7765
rev64 v12.16b, v12.16b //GHASH block 8k+4
7766
ldr q21, [x3, #144] //load h6k | h5k
7767
ldr q24, [x3, #192] //load h8k | h7k
7768
7769
rev32 v6.16b, v30.16b //CTR block 8k+14
7770
rev64 v8.16b, v8.16b //GHASH block 8k
7771
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
7772
7773
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
7774
ldr q23, [x3, #176] //load h7l | h7h
7775
ext v23.16b, v23.16b, v23.16b, #8
7776
ldr q25, [x3, #208] //load h8l | h8h
7777
ext v25.16b, v25.16b, v25.16b, #8
7778
rev64 v9.16b, v9.16b //GHASH block 8k+1
7779
7780
rev32 v7.16b, v30.16b //CTR block 8k+15
7781
rev64 v10.16b, v10.16b //GHASH block 8k+2
7782
ldr q20, [x3, #128] //load h5l | h5h
7783
ext v20.16b, v20.16b, v20.16b, #8
7784
ldr q22, [x3, #160] //load h6l | h6h
7785
ext v22.16b, v22.16b, v22.16b, #8
7786
7787
aese v0.16b, v26.16b
7788
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
7789
aese v1.16b, v26.16b
7790
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
7791
aese v4.16b, v26.16b
7792
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
7793
7794
aese v3.16b, v26.16b
7795
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
7796
aese v5.16b, v26.16b
7797
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
7798
aese v6.16b, v26.16b
7799
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
7800
7801
aese v4.16b, v27.16b
7802
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
7803
aese v7.16b, v26.16b
7804
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
7805
aese v2.16b, v26.16b
7806
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
7807
7808
ldp q28, q26, [x8, #32] //load rk2, rk3
7809
aese v0.16b, v27.16b
7810
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
7811
eor v8.16b, v8.16b, v19.16b //PRE 1
7812
7813
aese v7.16b, v27.16b
7814
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
7815
aese v6.16b, v27.16b
7816
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
7817
aese v2.16b, v27.16b
7818
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
7819
7820
aese v3.16b, v27.16b
7821
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
7822
aese v1.16b, v27.16b
7823
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
7824
aese v5.16b, v27.16b
7825
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
7826
7827
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
7828
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
7829
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
7830
7831
rev64 v11.16b, v11.16b //GHASH block 8k+3
7832
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
7833
7834
aese v5.16b, v28.16b
7835
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
7836
aese v7.16b, v28.16b
7837
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
7838
aese v1.16b, v28.16b
7839
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
7840
7841
aese v3.16b, v28.16b
7842
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
7843
aese v6.16b, v28.16b
7844
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
7845
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
7846
7847
aese v0.16b, v28.16b
7848
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
7849
aese v7.16b, v26.16b
7850
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
7851
7852
aese v5.16b, v26.16b
7853
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
7854
rev64 v14.16b, v14.16b //GHASH block 8k+6
7855
7856
aese v0.16b, v26.16b
7857
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
7858
aese v2.16b, v28.16b
7859
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
7860
aese v6.16b, v26.16b
7861
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
7862
7863
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
7864
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
7865
aese v4.16b, v28.16b
7866
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
7867
7868
ldp q27, q28, [x8, #64] //load rk4, rk5
7869
aese v1.16b, v26.16b
7870
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
7871
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
7872
7873
aese v2.16b, v26.16b
7874
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
7875
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
7876
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
7877
7878
aese v4.16b, v26.16b
7879
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
7880
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
7881
aese v3.16b, v26.16b
7882
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
7883
7884
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
7885
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
7886
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
7887
7888
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
7889
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
7890
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
7891
7892
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
7893
aese v5.16b, v27.16b
7894
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
7895
aese v0.16b, v27.16b
7896
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
7897
7898
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
7899
ldr q20, [x3, #32] //load h1l | h1h
7900
ext v20.16b, v20.16b, v20.16b, #8
7901
ldr q22, [x3, #64] //load h2l | h2h
7902
ext v22.16b, v22.16b, v22.16b, #8
7903
aese v7.16b, v27.16b
7904
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
7905
7906
aese v2.16b, v27.16b
7907
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
7908
aese v6.16b, v27.16b
7909
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
7910
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
7911
7912
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
7913
aese v7.16b, v28.16b
7914
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
7915
aese v1.16b, v27.16b
7916
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
7917
7918
aese v2.16b, v28.16b
7919
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
7920
aese v3.16b, v27.16b
7921
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
7922
aese v4.16b, v27.16b
7923
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
7924
7925
aese v1.16b, v28.16b
7926
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
7927
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
7928
aese v6.16b, v28.16b
7929
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
7930
7931
aese v4.16b, v28.16b
7932
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
7933
aese v3.16b, v28.16b
7934
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
7935
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
7936
7937
aese v0.16b, v28.16b
7938
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
7939
aese v5.16b, v28.16b
7940
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
7941
ldp q26, q27, [x8, #96] //load rk6, rk7
7942
7943
ldr q23, [x3, #80] //load h3l | h3h
7944
ext v23.16b, v23.16b, v23.16b, #8
7945
ldr q25, [x3, #112] //load h4l | h4h
7946
ext v25.16b, v25.16b, v25.16b, #8
7947
rev64 v15.16b, v15.16b //GHASH block 8k+7
7948
rev64 v13.16b, v13.16b //GHASH block 8k+5
7949
7950
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
7951
7952
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
7953
7954
aese v0.16b, v26.16b
7955
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
7956
ldr q21, [x3, #48] //load h2k | h1k
7957
ldr q24, [x3, #96] //load h4k | h3k
7958
aese v6.16b, v26.16b
7959
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
7960
7961
aese v5.16b, v26.16b
7962
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
7963
aese v7.16b, v26.16b
7964
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
7965
7966
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
7967
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
7968
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
7969
7970
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
7971
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
7972
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
7973
7974
aese v7.16b, v27.16b
7975
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
7976
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
7977
aese v1.16b, v26.16b
7978
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
7979
7980
aese v2.16b, v26.16b
7981
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
7982
aese v3.16b, v26.16b
7983
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
7984
aese v4.16b, v26.16b
7985
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
7986
7987
ldp q28, q26, [x8, #128] //load rk8, rk9
7988
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
7989
aese v5.16b, v27.16b
7990
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
7991
7992
aese v1.16b, v27.16b
7993
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
7994
aese v4.16b, v27.16b
7995
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
7996
7997
aese v6.16b, v27.16b
7998
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
7999
aese v2.16b, v27.16b
8000
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
8001
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
8002
8003
aese v0.16b, v27.16b
8004
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
8005
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
8006
aese v3.16b, v27.16b
8007
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
8008
8009
aese v0.16b, v28.16b
8010
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
8011
aese v7.16b, v28.16b
8012
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
8013
aese v4.16b, v28.16b
8014
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
8015
8016
aese v1.16b, v28.16b
8017
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
8018
aese v5.16b, v28.16b
8019
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
8020
aese v6.16b, v28.16b
8021
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
8022
8023
aese v3.16b, v28.16b
8024
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
8025
aese v4.16b, v26.16b
8026
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
8027
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
8028
8029
aese v0.16b, v26.16b
8030
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
8031
aese v1.16b, v26.16b
8032
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
8033
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
8034
8035
aese v6.16b, v26.16b
8036
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
8037
aese v7.16b, v26.16b
8038
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
8039
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
8040
8041
aese v2.16b, v28.16b
8042
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
8043
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
8044
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
8045
8046
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
8047
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
8048
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
8049
8050
ldp q27, q28, [x8, #160] //load rk10, rk11
8051
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
8052
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
8053
8054
aese v2.16b, v26.16b
8055
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
8056
aese v3.16b, v26.16b
8057
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
8058
aese v5.16b, v26.16b
8059
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
8060
8061
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
8062
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
8063
ldr d16, [x10] //MODULO - load modulo constant
8064
8065
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
8066
8067
aese v4.16b, v27.16b
8068
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
8069
aese v6.16b, v27.16b
8070
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
8071
aese v5.16b, v27.16b
8072
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
8073
8074
aese v0.16b, v27.16b
8075
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
8076
aese v2.16b, v27.16b
8077
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
8078
aese v3.16b, v27.16b
8079
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
8080
8081
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
8082
8083
aese v7.16b, v27.16b
8084
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
8085
aese v1.16b, v27.16b
8086
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
8087
ldp q26, q27, [x8, #192] //load rk12, rk13
8088
8089
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
8090
8091
aese v2.16b, v28.16b
8092
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
8093
aese v1.16b, v28.16b
8094
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
8095
aese v0.16b, v28.16b
8096
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
8097
8098
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
8099
aese v3.16b, v28.16b
8100
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
8101
8102
aese v7.16b, v28.16b
8103
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
8104
aese v6.16b, v28.16b
8105
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
8106
aese v4.16b, v28.16b
8107
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
8108
8109
aese v5.16b, v28.16b
8110
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
8111
aese v3.16b, v26.16b
8112
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
8113
8114
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
8115
8116
aese v3.16b, v27.16b //AES block 8k+11 - round 13
8117
aese v2.16b, v26.16b
8118
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
8119
aese v6.16b, v26.16b
8120
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
8121
8122
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
8123
aese v4.16b, v26.16b
8124
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
8125
aese v7.16b, v26.16b
8126
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
8127
8128
aese v0.16b, v26.16b
8129
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
8130
ldr q28, [x8, #224] //load rk14
8131
aese v1.16b, v26.16b
8132
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
8133
8134
aese v4.16b, v27.16b //AES block 8k+12 - round 13
8135
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
8136
aese v5.16b, v26.16b
8137
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
8138
8139
aese v6.16b, v27.16b //AES block 8k+14 - round 13
8140
aese v2.16b, v27.16b //AES block 8k+10 - round 13
8141
aese v1.16b, v27.16b //AES block 8k+9 - round 13
8142
8143
aese v5.16b, v27.16b //AES block 8k+13 - round 13
8144
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
8145
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
8146
8147
aese v7.16b, v27.16b //AES block 8k+15 - round 13
8148
aese v0.16b, v27.16b //AES block 8k+8 - round 13
8149
// Tail entry of the decrypt kernel.
// Decrypts the first remaining ciphertext block with keystream register v0, preloads the
// hash-key table entries used by the per-block sections, then dispatches on the number of
// bytes left (x5) to the matching .L256_dec_blocks_* label.
// Each time a dispatch level is skipped, the keystream registers are shifted up by one
// (towards v7) and the counter v30 is stepped back by one increment (v31), so the final
// block is always combined with v7 regardless of where the chain is entered.
.L256_dec_tail: //TAIL
8150
8151
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
8152
sub x5, x4, x0 //x5 = x4 - x0: number of bytes left to process
8153
cmp x5, #112 //flags are consumed by the b.gt below; nothing in between writes them
8154
8155
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
8156
8157
ldp q24, q25, [x3, #192] //load h8k | h7k (q24) and h8l | h8h (q25)
8158
ext v25.16b, v25.16b, v25.16b, #8
8159
mov v29.16b, v28.16b //keep a copy of v28: the per-block sections below reuse v28 as GHASH scratch
8160
8161
ldp q20, q21, [x3, #128] //load h5l | h5h (q20) and h6k | h5k (q21)
8162
ext v20.16b, v20.16b, v20.16b, #8
8163
8164
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
8165
ldp q22, q23, [x3, #160] //load h6l | h6h (q22) and h7l | h7h (q23)
8166
ext v22.16b, v22.16b, v22.16b, #8
8167
ext v23.16b, v23.16b, v23.16b, #8
8168
b.gt .L256_dec_blocks_more_than_7 //x5 > 112
8169
8170
// x5 <= 112: blocks_more_than_7 is skipped. Shift the keystream registers up by one and
// roll the counter back by one increment.
mov v7.16b, v6.16b
8171
sub v30.4s, v30.4s, v31.4s
8172
mov v6.16b, v5.16b
8173
8174
mov v5.16b, v4.16b
8175
mov v4.16b, v3.16b
8176
// blocks_more_than_7 is the only section that writes v17/v18/v19 without accumulating,
// so they are cleared here for every path that bypasses it.
movi v19.8b, #0
8177
8178
movi v17.8b, #0
8179
movi v18.8b, #0
8180
mov v3.16b, v2.16b
8181
8182
cmp x5, #96
8183
mov v2.16b, v1.16b //v1 is the keystream for the next block (v0 was consumed above)
8184
b.gt .L256_dec_blocks_more_than_6 //x5 > 96
8185
8186
// x5 <= 96: shift again; v1 is not overwritten, so it is re-inserted at the new position.
mov v7.16b, v6.16b
8187
mov v6.16b, v5.16b
8188
8189
mov v5.16b, v4.16b
8190
cmp x5, #80
8191
sub v30.4s, v30.4s, v31.4s
8192
8193
mov v4.16b, v3.16b
8194
mov v3.16b, v1.16b
8195
b.gt .L256_dec_blocks_more_than_5 //x5 > 80
8196
8197
// x5 <= 80
cmp x5, #64
8198
mov v7.16b, v6.16b
8199
sub v30.4s, v30.4s, v31.4s
8200
8201
mov v6.16b, v5.16b
8202
8203
mov v5.16b, v4.16b
8204
mov v4.16b, v1.16b
8205
b.gt .L256_dec_blocks_more_than_4 //x5 > 64
8206
8207
// x5 <= 64
sub v30.4s, v30.4s, v31.4s
8208
mov v7.16b, v6.16b
8209
cmp x5, #48
8210
8211
mov v6.16b, v5.16b
8212
mov v5.16b, v1.16b
8213
b.gt .L256_dec_blocks_more_than_3 //x5 > 48
8214
8215
// x5 <= 48: blocks_more_than_3 (which loads v24) is skipped, so load it here for
// blocks_more_than_2.
ldr q24, [x3, #96] //load h4k | h3k
8216
sub v30.4s, v30.4s, v31.4s
8217
mov v7.16b, v6.16b
8218
8219
cmp x5, #32
8220
mov v6.16b, v1.16b
8221
b.gt .L256_dec_blocks_more_than_2 //x5 > 32
8222
8223
// x5 <= 32
sub v30.4s, v30.4s, v31.4s
8224
8225
mov v7.16b, v1.16b
8226
cmp x5, #16
8227
b.gt .L256_dec_blocks_more_than_1 //x5 > 16
8228
8229
// x5 <= 16: only the block already decrypted into v12 remains. blocks_more_than_1
// (which loads v21) is skipped, so load it here for the final-block GHASH.
sub v30.4s, v30.4s, v31.4s
8230
ldr q21, [x3, #48] //load h2k | h1k
8231
b .L256_dec_blocks_less_than_1
8232
// More than 7 blocks remain.
// Stores the plaintext already in v12, hashes the ciphertext block in v9 with v25 (high/low
// products) and the upper half of v24 (mid product), then decrypts the next ciphertext block
// with keystream v1. This section initializes the accumulators v17 (high), v19 (low) and
// v18 (mid) by direct assignment; the later sections accumulate into them.
// Falls through into .L256_dec_blocks_more_than_6.
.L256_dec_blocks_more_than_7: //blocks left > 7
8233
rev64 v8.16b, v9.16b //GHASH final-7 block
8234
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
8235
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
8236
8237
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid (upper half of v24 moved to lane 0 for pmull)
8238
8239
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8240
8241
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
8242
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
8243
8244
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
8245
8246
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8247
movi v16.8b, #0 //suppress further partial tag feed in
8248
8249
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
8250
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
8251
// More than 6 blocks remain.
// Hashes the ciphertext block in v9 with v23 (high/low) and the lower half of v24 (mid),
// accumulating into v17/v19/v18; stores the plaintext in v12 and decrypts the next
// ciphertext block with keystream v2. v26, v27 and v28 are used as scratch.
// Falls through into .L256_dec_blocks_more_than_5.
.L256_dec_blocks_more_than_6: //blocks left > 6
8252
8253
rev64 v8.16b, v9.16b //GHASH final-6 block
8254
8255
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is zero if an earlier section already consumed it)
8256
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
8257
movi v16.8b, #0 //suppress further partial tag feed in
8258
8259
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
8260
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
8261
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
8262
8263
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
8264
8265
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
8266
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
8267
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8268
8269
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
8270
8271
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
8272
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
8273
// More than 5 blocks remain.
// Hashes the ciphertext block in v9 with v22 (high/low) and the upper half of v21 (mid),
// accumulating into v17/v19/v18; stores the plaintext in v12 and decrypts the next
// ciphertext block with keystream v3.
// Falls through into .L256_dec_blocks_more_than_4.
.L256_dec_blocks_more_than_5: //blocks left > 5
8274
8275
rev64 v8.16b, v9.16b //GHASH final-5 block
8276
8277
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8278
8279
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
8280
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
8281
8282
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
8283
8284
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8285
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
8286
8287
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
8288
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid (copy to the upper lane so pmull2 pairs it with the upper half of v21)
8289
8290
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
8291
8292
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
8293
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
8294
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
8295
8296
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
8297
movi v16.8b, #0 //suppress further partial tag feed in
8298
// More than 4 blocks remain.
// Hashes the ciphertext block in v9 with v20 (high/low) and the lower half of v21 (mid),
// accumulating into v17/v19/v18; stores the plaintext in v12 and decrypts the next
// ciphertext block with keystream v4.
// Falls through into .L256_dec_blocks_more_than_3.
.L256_dec_blocks_more_than_4: //blocks left > 4
8299
8300
rev64 v8.16b, v9.16b //GHASH final-4 block
8301
8302
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8303
8304
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
8305
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
8306
8307
movi v16.8b, #0 //suppress further partial tag feed in
8308
8309
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
8310
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
8311
8312
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8313
8314
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
8315
8316
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
8317
8318
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
8319
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
8320
8321
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
8322
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
8323
// More than 3 blocks remain.
// Reloads v25 and v24 from the table at x3 (offsets 112 and 96), hashes the ciphertext
// block in v9 with v25 (high/low) and the upper half of v24 (mid), accumulating into
// v17/v19/v18; stores the plaintext in v12 and decrypts the next ciphertext block with
// keystream v5.
// Falls through into .L256_dec_blocks_more_than_2.
.L256_dec_blocks_more_than_3: //blocks left > 3
8324
8325
ldr q25, [x3, #112] //load h4l | h4h
8326
ext v25.16b, v25.16b, v25.16b, #8
8327
rev64 v8.16b, v9.16b //GHASH final-3 block
8328
8329
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8330
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
8331
ldr q24, [x3, #96] //load h4k | h3k
8332
8333
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
8334
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
8335
8336
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
8337
8338
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8339
8340
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid (copy to the upper lane so pmull2 pairs it with the upper half of v24)
8341
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
8342
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
8343
8344
movi v16.8b, #0 //suppress further partial tag feed in
8345
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
8346
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
8347
8348
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
8349
8350
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
8351
// More than 2 blocks remain.
// Reloads v23 from the table at x3 (offset 80), hashes the ciphertext block in v9 with v23
// (high/low) and the lower half of v24 (mid), accumulating into v17/v19/v18; stores the
// plaintext in v12 and decrypts the next ciphertext block with keystream v6.
// v24 must already hold the entry at [x3, #96]: it is loaded either in
// .L256_dec_blocks_more_than_3 or in the tail dispatch just before branching here.
// Falls through into .L256_dec_blocks_more_than_1.
.L256_dec_blocks_more_than_2: //blocks left > 2
8352
8353
rev64 v8.16b, v9.16b //GHASH final-2 block
8354
8355
ldr q23, [x3, #80] //load h3l | h3h
8356
ext v23.16b, v23.16b, v23.16b, #8
8357
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
8358
8359
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8360
8361
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
8362
8363
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
8364
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
8365
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
8366
8367
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8368
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
8369
movi v16.8b, #0 //suppress further partial tag feed in
8370
8371
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
8372
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
8373
8374
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
8375
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
8376
// More than 1 block remains.
// Reloads v22 and v21 from the table at x3 (offsets 64 and 48), hashes the ciphertext block
// in v9 with v22 (high/low) and the upper half of v21 (mid), accumulating into v17/v19/v18;
// stores the plaintext in v12 and decrypts the final ciphertext block with keystream v7.
// The final load reads a full 16 bytes from x0; any excess bits are masked off in
// .L256_dec_blocks_less_than_1, which this section falls through into.
.L256_dec_blocks_more_than_1: //blocks left > 1
8377
8378
rev64 v8.16b, v9.16b //GHASH final-1 block
8379
8380
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8381
8382
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
8383
ldr q22, [x3, #64] //load h2l | h2h
8384
ext v22.16b, v22.16b, v22.16b, #8
8385
8386
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
8387
ldr q9, [x0], #16 //AES final block - load ciphertext
8388
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
8389
8390
ldr q21, [x3, #48] //load h2k | h1k
8391
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
8392
8393
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid (copy to the upper lane so pmull2 pairs it with the upper half of v21)
8394
8395
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
8396
8397
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
8398
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
8399
8400
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
8401
8402
movi v16.8b, #0 //suppress further partial tag feed in
8403
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
8404
8405
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
8406
// Final (possibly partial) block, GHASH reduction and function epilogue.
// On entry v9 holds the last ciphertext block as loaded (16 bytes), v12 the corresponding
// plaintext, and v21 the table entry from [x3, #48].
// Steps:
//   1. Build a 128-bit mask in v0 from the bit length in x1 that covers the valid bits of
//      the last block.
//   2. Write the byte-reversed counter v30 back to [x16].
//   3. Mask the ciphertext for hashing, and merge the plaintext with the bytes already
//      present at [x2] so that bytes beyond the valid length are rewritten unchanged.
//      Note that the ld1/st1 pair always accesses a full 16 bytes at x2.
//   4. Hash the last block, reduce v17:v18:v19 with the constant loaded from [x10], and
//      store the resulting 16-byte value to [x3].
//   5. Return x9 in x0, restore d8-d15 and release the 80-byte stack frame.
//      NOTE(review): x9 is set before this excerpt; presumably the processed byte
//      length - confirm against the function prologue.
.L256_dec_blocks_less_than_1: //blocks left <= 1
8407
8408
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
8409
mvn x6, xzr //temp0_x = 0xffffffffffffffff
8410
and x1, x1, #127 //bit_length %= 128
8411
8412
sub x1, x1, #128 //bit_length -= 128
8413
rev32 v30.16b, v30.16b
8414
str q30, [x16] //store the updated counter
8415
8416
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
8417
8418
and x1, x1, #127 //bit_length %= 128 (x1 = number of unused bits in the last block, 0..127)
8419
8420
lsr x6, x6, x1 //temp0_x is mask for top 64b of last block (register shift amount is taken modulo 64)
8421
cmp x1, #64
8422
mvn x7, xzr //temp1_x = 0xffffffffffffffff
8423
8424
// x1 < 64:  low lane = all ones, high lane = shifted mask
// x1 >= 64: low lane = shifted mask, high lane = 0
csel x14, x6, xzr, lt
8425
csel x13, x7, x6, lt
8426
8427
mov v0.d[0], x13 //ctr0b is mask for last block
8428
mov v0.d[1], x14
8429
8430
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
8431
ldr q20, [x3, #32] //load h1l | h1h
8432
ext v20.16b, v20.16b, v20.16b, #8
8433
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
8434
8435
rev64 v8.16b, v9.16b //GHASH final block
8436
8437
eor v8.16b, v8.16b, v16.16b //feed in partial tag
8438
8439
ins v16.d[0], v8.d[1] //GHASH final block - mid
8440
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
8441
8442
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid (v16.d[0] = v8.d[1] ^ v8.d[0])
8443
8444
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
8445
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
8446
8447
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
8448
8449
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
8450
ldr d16, [x10] //MODULO - load modulo constant
8451
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
8452
8453
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
8454
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
8455
8456
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
8457
st1 { v12.16b}, [x2] //store all 16B
8458
8459
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
8460
8461
eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid
8462
eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid
8463
8464
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
8465
8466
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
8467
eor v19.16b, v19.16b, v17.16b //MODULO - fold into low
8468
8469
eor v19.16b, v19.16b, v18.16b //MODULO - fold into low
8470
ext v19.16b, v19.16b, v19.16b, #8 //swap the two 64-bit halves
8471
rev64 v19.16b, v19.16b //byte-reverse each half before storing
8472
st1 { v19.16b }, [x3] //store the reduced hash value at the start of the table pointed to by x3
8473
mov x0, x9 //return value
8474
8475
ldp d10, d11, [sp, #16] //restore callee-saved SIMD registers
8476
ldp d12, d13, [sp, #32]
8477
ldp d14, d15, [sp, #48]
8478
ldp d8, d9, [sp], #80 //restore d8/d9 and pop the 80-byte frame
8479
ret
8480
8481
// Alternate exit that returns 0 without touching the stack or any SIMD register.
// NOTE(review): the branch to this label is outside this excerpt; presumably the
// zero-length check at function entry - confirm against the prologue.
.L256_dec_ret:
8482
mov w0, #0x0 //return 0
8483
ret
8484
.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
8485
// NUL-terminated ASCII string: "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian@arm.com>"
.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
8486
.align 2
8487
.align 2
8488
#endif
8489
8490