/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <[email protected]>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align	4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
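/*
 * ST4()/ST5() expand their argument only when the 4-way or 5-way interleave
 * is selected via MAX_STRIDE, so the shared code below can emit either
 * variant from the same source.
 */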

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 */
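/*
 * Both routines process MAX_STRIDE blocks per pass through the Nx loop
 * while enough input remains, then fall back to one block at a time;
 * 'blocks' is the total number of 16-byte AES blocks.
 */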

AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl	aes_encrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_encrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbencloopNx
.Lecbenc1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbencout
.Lecbencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_decrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbdecloopNx
.Lecbdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbdecout
.Lecbdecloop:
	ld1	{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)


/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 */
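/*
 * CBC encryption is inherently serial (every block is chained through the
 * previous ciphertext block), so only the decryption path further down is
 * interleaved. The ESSIV variants first encrypt the IV with the second key
 * schedule rk2 (AES-256, hence the fixed 14 rounds) to derive the actual
 * CBC IV.
 */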

AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b	.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs	w4, w4, #4
	bmi	.Lcbcenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor	v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor	v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor	v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v3.16b
	b	.Lcbcencloop4x
.Lcbcenc1x:
	adds	w4, w4, #4
	beq	.Lcbcencout
.Lcbcencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1	{v4.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcencloop
.Lcbcencout:
	st1	{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
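/*
 * CBC decryption can be parallelised, but the ciphertext blocks are needed
 * again as chaining values, so copies are kept in spare registers (and
 * reloaded from memory as needed) before the blocks are decrypted.
 */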

AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1	{cbciv.16b}, [x5]		/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b	.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1	{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	frame_push	0
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lcbcdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
	mov	v5.16b, v0.16b
	mov	v6.16b, v1.16b
	mov	v7.16b, v2.16b
	bl	aes_decrypt_block5x
	sub	x1, x1, #32
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b
#else
	mov	v4.16b, v0.16b
	mov	v5.16b, v1.16b
	mov	v6.16b, v2.16b
	bl	aes_decrypt_block4x
	sub	x1, x1, #16
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v4.16b
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v5.16b
	eor	v3.16b, v3.16b, v6.16b
#endif
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LcbcdecloopNx
.Lcbcdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lcbcdecout
.Lcbcdecloop:
	ld1	{v1.16b}, [x1], #16		/* get next ct block */
	mov	v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov	cbciv.16b, v1.16b		/* ct is next iv */
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcdecloop
.Lcbcdecout:
	st1	{cbciv.16b}, [x5]		/* return iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


/*
 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 */
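/*
 * Ciphertext stealing tail for CBC: these routines process the final pair
 * of (possibly overlapping) 16-byte blocks of a message whose length need
 * not be a multiple of the block size. The permute table below is used
 * with tbl/tbx, and the loads and stores overlap so that nothing outside
 * the caller's buffers is ever accessed.
 */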

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl	v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor	v1.16b, v1.16b, v0.16b
	tbl	v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add	x4, x0, x4
	st1	{v0.16b}, [x4]			/* overlapping stores */
	st1	{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	tbx	v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */

	add	x4, x0, x4
	st1	{v2.16b}, [x4]			/* overlapping stores */
	st1	{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align	6
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
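/*
 * The permute table is 16 bytes of 0xff, the identity permutation 0x0-0xf,
 * then another 16 bytes of 0xff. Indexing into it at an offset derived from
 * the tail length produces a tbl/tbx index vector that shifts the final
 * partial block into place: out-of-range 0xff entries yield zero bytes with
 * tbl and leave the destination byte untouched with tbx.
 */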

/*
 * This macro generates the code for CTR and XCTR mode.
 */
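/*
 * In CTR mode the 16-byte counter block is big endian and is simply
 * incremented for each block. In XCTR mode (as used by HCTR2) a
 * little-endian 64-bit block counter is XORed into the IV instead, which
 * avoids any carry handling.
 */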
	.macro	ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1	{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register. For CTR mode this lets us
	 * easily increment the IV. For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
	.if \xctr
	umov	IV_PART, vctr.d[0]
	lsr	CTR_W, BYTE_CTR_W, #4
	.else
	umov	IV_PART, vctr.d[1]
	rev	IV_PART, IV_PART
	.endif

.LctrloopNx\xctr:
	add	BLOCKS_W, BYTES_W, #15
	sub	BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr	BLOCKS_W, BLOCKS_W, #4
	mov	w8, #MAX_STRIDE
	cmp	BLOCKS_W, w8
	csel	BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
	 */
	.if \xctr
	add	CTR, CTR, BLOCKS
	.else
	adds	IV_PART, IV_PART, BLOCKS
	.endif
	mov	v0.16b, vctr.16b
	mov	v1.16b, vctr.16b
	mov	v2.16b, vctr.16b
	mov	v3.16b, vctr.16b
ST5(	mov	v4.16b, vctr.16b	)
	.if \xctr
	sub	x6, CTR, #MAX_STRIDE - 1
	sub	x7, CTR, #MAX_STRIDE - 2
	sub	x8, CTR, #MAX_STRIDE - 3
	sub	x9, CTR, #MAX_STRIDE - 4
ST5(	sub	x10, CTR, #MAX_STRIDE - 5	)
	eor	x6, x6, IV_PART
	eor	x7, x7, IV_PART
	eor	x8, x8, IV_PART
	eor	x9, x9, IV_PART
ST5(	eor	x10, x10, IV_PART	)
	mov	v0.d[0], x6
	mov	v1.d[0], x7
	mov	v2.d[0], x8
	mov	v3.d[0], x9
ST5(	mov	v4.d[0], x10	)
	.else
	bcs	0f
	.subsection	1
	/*
	 * This subsection handles carries.
	 *
	 * Conditional branching here is allowed with respect to time
	 * invariance since the branches are dependent on the IV instead
	 * of the plaintext or key. This code is rarely executed in
	 * practice anyway.
	 */

	/* Apply carry to outgoing counter. */
0:	umov	x8, vctr.d[0]
	rev	x8, x8
	add	x8, x8, #1
	rev	x8, x8
	ins	vctr.d[0], x8

	/*
	 * Apply carry to counter blocks if needed.
	 *
	 * Since the carry flag was set, we know 0 <= IV_PART <
	 * MAX_STRIDE. Using the value of IV_PART we can determine how
	 * many counter blocks need to be updated.
	 */
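	/*
	 * Each entry in the branch table below is 8 bytes: a BTI landing
	 * pad followed by one mov. Branching to label 1 minus 8 * IV_PART
	 * therefore copies the incremented high counter half into exactly
	 * the last IV_PART counter blocks.
	 */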
	cbz	IV_PART, 2f
	adr	x16, 1f
	sub	x16, x16, IV_PART, lsl #3
	br	x16
	bti	c
	mov	v0.d[0], vctr.d[0]
	bti	c
	mov	v1.d[0], vctr.d[0]
	bti	c
	mov	v2.d[0], vctr.d[0]
	bti	c
	mov	v3.d[0], vctr.d[0]
ST5(	bti	c	)
ST5(	mov	v4.d[0], vctr.d[0]	)
1:	b	2f
	.previous

2:	rev	x7, IV_PART
	ins	vctr.d[1], x7
	sub	x7, IV_PART, #MAX_STRIDE - 1
	sub	x8, IV_PART, #MAX_STRIDE - 2
	sub	x9, IV_PART, #MAX_STRIDE - 3
	rev	x7, x7
	rev	x8, x8
	mov	v1.d[1], x7
	rev	x9, x9
ST5(	sub	x10, IV_PART, #MAX_STRIDE - 4	)
	mov	v2.d[1], x8
ST5(	rev	x10, x10	)
	mov	v3.d[1], x9
ST5(	mov	v4.d[1], x10	)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store. Otherwise jump to tail handling.
	 */
	tbnz	BYTES_W, #31, .Lctrtail\xctr
	ld1	{v5.16b-v7.16b}, [IN], #48
ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)
	eor	v0.16b, v5.16b, v0.16b
ST4(	ld1	{v5.16b}, [IN], #16	)
	eor	v1.16b, v6.16b, v1.16b
ST5(	ld1	{v5.16b-v6.16b}, [IN], #32	)
	eor	v2.16b, v7.16b, v2.16b
	eor	v3.16b, v5.16b, v3.16b
ST5(	eor	v4.16b, v6.16b, v4.16b	)
	st1	{v0.16b-v3.16b}, [OUT], #64
ST5(	st1	{v4.16b}, [OUT], #16	)
	cbz	BYTES_W, .Lctrout\xctr
	b	.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
	st1	{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory. Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks. This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
	mov	x16, #16
	ands	w7, BYTES_W, #0xf
	csel	x13, x7, x16, ne

ST5(	cmp	BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel	x14, x16, xzr, gt	)
	cmp	BYTES_W, #48 - (MAX_STRIDE << 4)
	csel	x15, x16, xzr, gt
	cmp	BYTES_W, #32 - (MAX_STRIDE << 4)
	csel	x16, x16, xzr, gt
	cmp	BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l	x9, .Lcts_permute_table
	add	x9, x9, x13
	ble	.Lctrtail1x\xctr

ST5(	ld1	{v5.16b}, [IN], x14	)
	ld1	{v6.16b}, [IN], x15
	ld1	{v7.16b}, [IN], x16

ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)

	ld1	{v8.16b}, [IN], x13
	ld1	{v9.16b}, [IN]
	ld1	{v10.16b}, [x9]

ST4(	eor	v6.16b, v6.16b, v0.16b	)
ST4(	eor	v7.16b, v7.16b, v1.16b	)
ST4(	tbl	v3.16b, {v3.16b}, v10.16b	)
ST4(	eor	v8.16b, v8.16b, v2.16b	)
ST4(	eor	v9.16b, v9.16b, v3.16b	)

ST5(	eor	v5.16b, v5.16b, v0.16b	)
ST5(	eor	v6.16b, v6.16b, v1.16b	)
ST5(	tbl	v4.16b, {v4.16b}, v10.16b	)
ST5(	eor	v7.16b, v7.16b, v2.16b	)
ST5(	eor	v8.16b, v8.16b, v3.16b	)
ST5(	eor	v9.16b, v9.16b, v4.16b	)

ST5(	st1	{v5.16b}, [OUT], x14	)
	st1	{v6.16b}, [OUT], x15
	st1	{v7.16b}, [OUT], x16
	add	x13, x13, OUT
	st1	{v9.16b}, [x13]		// overlapping stores
	st1	{v8.16b}, [OUT]
	b	.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes. To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * This causes unusual behaviour when encrypting/decrypting less than 16
	 * bytes; the end of the data is expected to be at the end of the
	 * temporary buffer rather than the start of the data being at the start
	 * of the temporary buffer.
	 */
	sub	x8, x7, #16
	csel	x7, x7, x8, eq
	add	IN, IN, x7
	add	OUT, OUT, x7
	ld1	{v5.16b}, [IN]
	ld1	{v6.16b}, [OUT]
ST5(	mov	v3.16b, v4.16b	)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1	{v10.16b-v11.16b}, [x9]
	tbl	v3.16b, {v3.16b}, v10.16b
	sshr	v11.16b, v11.16b, #7
	eor	v5.16b, v5.16b, v3.16b
	bif	v5.16b, v6.16b, v11.16b
	st1	{v5.16b}, [OUT]
	b	.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
	.endm

/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int bytes, u8 ctr[])
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
 * accesses will occur. The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt	0
AES_FUNC_END(aes_ctr_encrypt)

/*
 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		    int bytes, u8 const iv[], int byte_ctr)
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
 * accesses will occur. The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt	1
AES_FUNC_END(aes_xctr_encrypt)


/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 */
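/*
 * On the first call ('first' nonzero) the incoming IV is encrypted with the
 * tweak key schedule rk2 to produce the initial tweak; later calls pass the
 * running tweak back in via iv[]. 'bytes' need not be a multiple of 16; a
 * trailing partial block is handled with ciphertext stealing, reusing
 * .Lcts_permute_table.
 */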

	.macro	next_tweak, out, in, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, xtsmask.16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro	xts_load_mask, tmp
	movi	xtsmask.2s, #0x1
	movi	\tmp\().2s, #0x87
	uzp1	xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
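/*
 * next_tweak multiplies the 128-bit tweak by x in GF(2^128): each 64-bit
 * half is shifted left by one bit, the bit shifted out of the low half is
 * carried into the high half, and a carry out of bit 127 is reduced by
 * XORing 0x87 (the polynomial x^128 + x^7 + x^2 + x + 1) into the low byte.
 * xtsmask holds the two per-lane carry masks {0x1, 0x87} used for this.
 */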

AES_FUNC_START(aes_xts_encrypt)
	frame_push	0

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	cbz	w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key	w3, x2, x8
	b	.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs	w4, w4, #64
	bmi	.Lxtsenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_encrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsencret
	xts_reload_mask	v8
	b	.LxtsencloopNx
.Lxtsenc1x:
	adds	w4, w4, #64
	beq	.Lxtsencout
	subs	w4, w4, #16
	bmi	.LxtsencctsNx
.Lxtsencloop:
	ld1	{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor	v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	cbz	w4, .Lxtsencout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	bmi	.Lxtsenccts
	st1	{v0.16b}, [x0], #16
	b	.Lxtsencloop
.Lxtsencout:
	st1	{v0.16b}, [x0]
.Lxtsencret:
	st1	{v4.16b}, [x6]
	frame_pop
	ret

.LxtsencctsNx:
	mov	v0.16b, v3.16b
	sub	x0, x0, #16
.Lxtsenccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b
	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	frame_push	0

	/* subtract 16 bytes if we are doing CTS */
	sub	w8, w4, #0x10
	tst	w4, #0xf
	csel	w4, w4, w8, eq

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz	w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b	.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs	w4, w4, #64
	bmi	.Lxtsdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_decrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsdecout
	xts_reload_mask	v8
	b	.LxtsdecloopNx
.Lxtsdec1x:
	adds	w4, w4, #64
	beq	.Lxtsdecout
	subs	w4, w4, #16
.Lxtsdecloop:
	ld1	{v0.16b}, [x1], #16
	bmi	.Lxtsdeccts
.Lxtsdecctsout:
	eor	v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x0], #16
	cbz	w4, .Lxtsdecout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	b	.Lxtsdecloop
.Lxtsdecout:
	st1	{v4.16b}, [x6]
	frame_pop
	ret

.Lxtsdeccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	eor	v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v5.16b

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b

	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 */
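/*
 * Core of the CBC-MAC based modes: 'dg' is the running digest, which is
 * XORed with each input block and then encrypted. 'enc_before' requests an
 * extra encryption of the digest before any input is consumed, 'enc_after'
 * one after the final block. The return value is the number of blocks left
 * unprocessed if the loop had to yield early.
 */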
AES_FUNC_START(aes_mac_update)
	ld1	{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz	w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs	w3, w3, #4
	bmi	.Lmac1x
	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v4.16b
	cmp	w3, wzr
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1	{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b	.Lmacloop4x
.Lmac1x:
	add	w3, w3, #4
.Lmacloop:
	cbz	w3, .Lmacout
	ld1	{v1.16b}, [x0], #16		/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs	w3, w3, #1
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b	.Lmacloop

.Lmacout:
	st1	{v0.16b}, [x4]			/* return dg */
	mov	w0, w3
	ret
AES_FUNC_END(aes_mac_update)