/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <[email protected]>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align	4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
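/*
 * ST4()/ST5() expand their argument only when the 4-way or 5-way interleave
 * is selected via MAX_STRIDE, so the shared code below can emit either
 * variant from the same source.
 */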

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 */
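/*
 * Both routines process MAX_STRIDE blocks per pass through the Nx loop
 * while enough input remains, then fall back to one block at a time;
 * 'blocks' is the total number of 16-byte AES blocks.
 */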

AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl	aes_encrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_encrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbencloopNx
.Lecbenc1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbencout
.Lecbencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lecbdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x	)
ST5(	ld1	{v4.16b}, [x1], #16	)
ST5(	bl	aes_decrypt_block5x	)
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LecbdecloopNx
.Lecbdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lecbdecout
.Lecbdecloop:
	ld1	{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)


/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 */
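/*
 * CBC encryption is inherently serial (every block is chained through the
 * previous ciphertext block), so only the decryption path further down is
 * interleaved. The ESSIV variants first encrypt the IV with the second key
 * schedule rk2 (AES-256, hence the fixed 14 rounds) to derive the actual
 * CBC IV.
 */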

AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b	.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1	{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs	w4, w4, #4
	bmi	.Lcbcenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor	v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor	v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor	v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v3.16b
	b	.Lcbcencloop4x
.Lcbcenc1x:
	adds	w4, w4, #4
	beq	.Lcbcencout
.Lcbcencloop:
	ld1	{v0.16b}, [x1], #16		/* get next pt block */
	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1	{v4.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcencloop
.Lcbcencout:
	st1	{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
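/*
 * CBC decryption can be parallelised, but the ciphertext blocks are needed
 * again as chaining values, so copies are kept in spare registers (and
 * reloaded from memory as needed) before the blocks are decrypted.
 */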

AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1	{cbciv.16b}, [x5]		/* get iv */

	mov	w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b	.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1	{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	frame_push	0
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs	w4, w4, #MAX_STRIDE
	bmi	.Lcbcdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
	mov	v5.16b, v0.16b
	mov	v6.16b, v1.16b
	mov	v7.16b, v2.16b
	bl	aes_decrypt_block5x
	sub	x1, x1, #32
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v5.16b
	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b
	eor	v4.16b, v4.16b, v5.16b
#else
	mov	v4.16b, v0.16b
	mov	v5.16b, v1.16b
	mov	v6.16b, v2.16b
	bl	aes_decrypt_block4x
	sub	x1, x1, #16
	eor	v0.16b, v0.16b, cbciv.16b
	eor	v1.16b, v1.16b, v4.16b
	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor	v2.16b, v2.16b, v5.16b
	eor	v3.16b, v3.16b, v6.16b
#endif
	st1	{v0.16b-v3.16b}, [x0], #64
ST5(	st1	{v4.16b}, [x0], #16	)
	b	.LcbcdecloopNx
.Lcbcdec1x:
	adds	w4, w4, #MAX_STRIDE
	beq	.Lcbcdecout
.Lcbcdecloop:
	ld1	{v1.16b}, [x1], #16		/* get next ct block */
	mov	v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov	cbciv.16b, v1.16b		/* ct is next iv */
	st1	{v0.16b}, [x0], #16
	subs	w4, w4, #1
	bne	.Lcbcdecloop
.Lcbcdecout:
	st1	{cbciv.16b}, [x5]		/* return iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


/*
 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 */
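/*
 * Ciphertext stealing tail for CBC: these routines process the final pair
 * of (possibly overlapping) 16-byte blocks of a message whose length need
 * not be a multiple of the block size. The permute table below is used
 * with tbl/tbx, and the loads and stores overlap so that nothing outside
 * the caller's buffers is ever accessed.
 */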

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl	v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor	v1.16b, v1.16b, v0.16b
	tbl	v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add	x4, x0, x4
	st1	{v0.16b}, [x4]			/* overlapping stores */
	st1	{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l	x8, .Lcts_permute_table
	sub	x4, x4, #16
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	ld1	{v3.16b}, [x8]
	ld1	{v4.16b}, [x9]

	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
	ld1	{v1.16b}, [x1]

	ld1	{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	tbx	v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */

	add	x4, x0, x4
	st1	{v2.16b}, [x4]			/* overlapping stores */
	st1	{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)

	.section	".rodata", "a"
	.align	6
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
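/*
 * The permute table is 16 bytes of 0xff, the identity permutation 0x0-0xf,
 * then another 16 bytes of 0xff. Indexing into it at an offset derived from
 * the tail length produces a tbl/tbx index vector that shifts the final
 * partial block into place: out-of-range 0xff entries yield zero bytes with
 * tbl and leave the destination byte untouched with tbx.
 */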

/*
 * This macro generates the code for CTR and XCTR mode.
 */
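/*
 * In CTR mode the 16-byte counter block is big endian and is simply
 * incremented for each block. In XCTR mode (as used by HCTR2) a
 * little-endian 64-bit block counter is XORed into the IV instead, which
 * avoids any carry handling.
 */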
	.macro	ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1	{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register. For CTR mode this lets us
	 * easily increment the IV. For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
	.if \xctr
	umov	IV_PART, vctr.d[0]
	lsr	CTR_W, BYTE_CTR_W, #4
	.else
	umov	IV_PART, vctr.d[1]
	rev	IV_PART, IV_PART
	.endif

.LctrloopNx\xctr:
	add	BLOCKS_W, BYTES_W, #15
	sub	BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr	BLOCKS_W, BLOCKS_W, #4
	mov	w8, #MAX_STRIDE
	cmp	BLOCKS_W, w8
	csel	BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
	 */
	.if \xctr
	add	CTR, CTR, BLOCKS
	.else
	adds	IV_PART, IV_PART, BLOCKS
	.endif
	mov	v0.16b, vctr.16b
	mov	v1.16b, vctr.16b
	mov	v2.16b, vctr.16b
	mov	v3.16b, vctr.16b
ST5(	mov	v4.16b, vctr.16b	)
	.if \xctr
	sub	x6, CTR, #MAX_STRIDE - 1
	sub	x7, CTR, #MAX_STRIDE - 2
	sub	x8, CTR, #MAX_STRIDE - 3
	sub	x9, CTR, #MAX_STRIDE - 4
ST5(	sub	x10, CTR, #MAX_STRIDE - 5	)
	eor	x6, x6, IV_PART
	eor	x7, x7, IV_PART
	eor	x8, x8, IV_PART
	eor	x9, x9, IV_PART
ST5(	eor	x10, x10, IV_PART	)
	mov	v0.d[0], x6
	mov	v1.d[0], x7
	mov	v2.d[0], x8
	mov	v3.d[0], x9
ST5(	mov	v4.d[0], x10	)
	.else
	bcs	0f
	.subsection	1
	/*
	 * This subsection handles carries.
	 *
	 * Conditional branching here is allowed with respect to time
	 * invariance since the branches are dependent on the IV instead
	 * of the plaintext or key. This code is rarely executed in
	 * practice anyway.
	 */

	/* Apply carry to outgoing counter. */
0:	umov	x8, vctr.d[0]
	rev	x8, x8
	add	x8, x8, #1
	rev	x8, x8
	ins	vctr.d[0], x8

	/*
	 * Apply carry to counter blocks if needed.
	 *
	 * Since the carry flag was set, we know 0 <= IV_PART <
	 * MAX_STRIDE. Using the value of IV_PART we can determine how
	 * many counter blocks need to be updated.
	 */
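	/*
	 * Each entry in the branch table below is 8 bytes: a BTI landing
	 * pad followed by one mov. Branching to label 1 minus 8 * IV_PART
	 * therefore copies the incremented high counter half into exactly
	 * the last IV_PART counter blocks.
	 */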
	cbz	IV_PART, 2f
	adr	x16, 1f
	sub	x16, x16, IV_PART, lsl #3
	br	x16
	bti	c
	mov	v0.d[0], vctr.d[0]
	bti	c
	mov	v1.d[0], vctr.d[0]
	bti	c
	mov	v2.d[0], vctr.d[0]
	bti	c
	mov	v3.d[0], vctr.d[0]
ST5(	bti	c	)
ST5(	mov	v4.d[0], vctr.d[0]	)
1:	b	2f
	.previous

2:	rev	x7, IV_PART
	ins	vctr.d[1], x7
	sub	x7, IV_PART, #MAX_STRIDE - 1
	sub	x8, IV_PART, #MAX_STRIDE - 2
	sub	x9, IV_PART, #MAX_STRIDE - 3
	rev	x7, x7
	rev	x8, x8
	mov	v1.d[1], x7
	rev	x9, x9
ST5(	sub	x10, IV_PART, #MAX_STRIDE - 4	)
	mov	v2.d[1], x8
ST5(	rev	x10, x10	)
	mov	v3.d[1], x9
ST5(	mov	v4.d[1], x10	)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store. Otherwise jump to tail handling.
	 */
	tbnz	BYTES_W, #31, .Lctrtail\xctr
	ld1	{v5.16b-v7.16b}, [IN], #48
ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)
	eor	v0.16b, v5.16b, v0.16b
ST4(	ld1	{v5.16b}, [IN], #16	)
	eor	v1.16b, v6.16b, v1.16b
ST5(	ld1	{v5.16b-v6.16b}, [IN], #32	)
	eor	v2.16b, v7.16b, v2.16b
	eor	v3.16b, v5.16b, v3.16b
ST5(	eor	v4.16b, v6.16b, v4.16b	)
	st1	{v0.16b-v3.16b}, [OUT], #64
ST5(	st1	{v4.16b}, [OUT], #16	)
	cbz	BYTES_W, .Lctrout\xctr
	b	.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
	st1	{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory. Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks. This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
	mov	x16, #16
	ands	w7, BYTES_W, #0xf
	csel	x13, x7, x16, ne

ST5(	cmp	BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel	x14, x16, xzr, gt	)
	cmp	BYTES_W, #48 - (MAX_STRIDE << 4)
	csel	x15, x16, xzr, gt
	cmp	BYTES_W, #32 - (MAX_STRIDE << 4)
	csel	x16, x16, xzr, gt
	cmp	BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l	x9, .Lcts_permute_table
	add	x9, x9, x13
	ble	.Lctrtail1x\xctr

ST5(	ld1	{v5.16b}, [IN], x14	)
	ld1	{v6.16b}, [IN], x15
	ld1	{v7.16b}, [IN], x16

ST4(	bl	aes_encrypt_block4x	)
ST5(	bl	aes_encrypt_block5x	)

	ld1	{v8.16b}, [IN], x13
	ld1	{v9.16b}, [IN]
	ld1	{v10.16b}, [x9]

ST4(	eor	v6.16b, v6.16b, v0.16b	)
ST4(	eor	v7.16b, v7.16b, v1.16b	)
ST4(	tbl	v3.16b, {v3.16b}, v10.16b	)
ST4(	eor	v8.16b, v8.16b, v2.16b	)
ST4(	eor	v9.16b, v9.16b, v3.16b	)

ST5(	eor	v5.16b, v5.16b, v0.16b	)
ST5(	eor	v6.16b, v6.16b, v1.16b	)
ST5(	tbl	v4.16b, {v4.16b}, v10.16b	)
ST5(	eor	v7.16b, v7.16b, v2.16b	)
ST5(	eor	v8.16b, v8.16b, v3.16b	)
ST5(	eor	v9.16b, v9.16b, v4.16b	)

ST5(	st1	{v5.16b}, [OUT], x14	)
	st1	{v6.16b}, [OUT], x15
	st1	{v7.16b}, [OUT], x16
	add	x13, x13, OUT
	st1	{v9.16b}, [x13]		// overlapping stores
	st1	{v8.16b}, [OUT]
	b	.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes. To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * This causes unusual behaviour when encrypting/decrypting less than 16
	 * bytes; the end of the data is expected to be at the end of the
	 * temporary buffer rather than the start of the data being at the start
	 * of the temporary buffer.
	 */
	sub	x8, x7, #16
	csel	x7, x7, x8, eq
	add	IN, IN, x7
	add	OUT, OUT, x7
	ld1	{v5.16b}, [IN]
	ld1	{v6.16b}, [OUT]
ST5(	mov	v3.16b, v4.16b	)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1	{v10.16b-v11.16b}, [x9]
	tbl	v3.16b, {v3.16b}, v10.16b
	sshr	v11.16b, v11.16b, #7
	eor	v5.16b, v5.16b, v3.16b
	bif	v5.16b, v6.16b, v11.16b
	st1	{v5.16b}, [OUT]
	b	.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
	.endm

/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int bytes, u8 ctr[])
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
 * accesses will occur. The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt	0
AES_FUNC_END(aes_ctr_encrypt)

/*
 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		    int bytes, u8 const iv[], int byte_ctr)
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
 * accesses will occur. The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt	1
AES_FUNC_END(aes_xctr_encrypt)


/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 */
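/*
 * On the first call ('first' nonzero) the incoming IV is encrypted with the
 * tweak key schedule rk2 to produce the initial tweak; later calls pass the
 * running tweak back in via iv[]. 'bytes' need not be a multiple of 16; a
 * trailing partial block is handled with ciphertext stealing, reusing
 * .Lcts_permute_table.
 */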

	.macro	next_tweak, out, in, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, xtsmask.16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro	xts_load_mask, tmp
	movi	xtsmask.2s, #0x1
	movi	\tmp\().2s, #0x87
	uzp1	xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
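/*
 * next_tweak multiplies the 128-bit tweak by x in GF(2^128): each 64-bit
 * half is shifted left by one bit, the bit shifted out of the low half is
 * carried into the high half, and a carry out of bit 127 is reduced by
 * XORing 0x87 (the polynomial x^128 + x^7 + x^2 + x + 1) into the low byte.
 * xtsmask holds the two per-lane carry masks {0x1, 0x87} used for this.
 */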

AES_FUNC_START(aes_xts_encrypt)
	frame_push	0

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	cbz	w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
	enc_switch_key	w3, x2, x8
	b	.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs	w4, w4, #64
	bmi	.Lxtsenc1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_encrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsencret
	xts_reload_mask	v8
	b	.LxtsencloopNx
.Lxtsenc1x:
	adds	w4, w4, #64
	beq	.Lxtsencout
	subs	w4, w4, #16
	bmi	.LxtsencctsNx
.Lxtsencloop:
	ld1	{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor	v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	cbz	w4, .Lxtsencout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	bmi	.Lxtsenccts
	st1	{v0.16b}, [x0], #16
	b	.Lxtsencloop
.Lxtsencout:
	st1	{v0.16b}, [x0]
.Lxtsencret:
	st1	{v4.16b}, [x6]
	frame_pop
	ret

.LxtsencctsNx:
	mov	v0.16b, v3.16b
	sub	x0, x0, #16
.Lxtsenccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b
	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	frame_push	0

	/* subtract 16 bytes if we are doing CTS */
	sub	w8, w4, #0x10
	tst	w4, #0xf
	csel	w4, w4, w8, eq

	ld1	{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz	w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b	.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs	w4, w4, #64
	bmi	.Lxtsdec1x
	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor	v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor	v3.16b, v3.16b, v7.16b
	bl	aes_decrypt_block4x
	eor	v3.16b, v3.16b, v7.16b
	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	st1	{v0.16b-v3.16b}, [x0], #64
	mov	v4.16b, v7.16b
	cbz	w4, .Lxtsdecout
	xts_reload_mask	v8
	b	.LxtsdecloopNx
.Lxtsdec1x:
	adds	w4, w4, #64
	beq	.Lxtsdecout
	subs	w4, w4, #16
.Lxtsdecloop:
	ld1	{v0.16b}, [x1], #16
	bmi	.Lxtsdeccts
.Lxtsdecctsout:
	eor	v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x0], #16
	cbz	w4, .Lxtsdecout
	subs	w4, w4, #16
	next_tweak	v4, v4, v8
	b	.Lxtsdecloop
.Lxtsdecout:
	st1	{v4.16b}, [x6]
	frame_pop
	ret

.Lxtsdeccts:
	adr_l	x8, .Lcts_permute_table

	add	x1, x1, w4, sxtw	/* rewind input pointer */
	add	w4, w4, #16		/* # bytes in final block */
	add	x9, x8, #32
	add	x8, x8, x4
	sub	x9, x9, x4
	add	x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1	{v1.16b}, [x1]		/* load final block */
	ld1	{v2.16b}, [x8]
	ld1	{v3.16b}, [x9]

	eor	v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor	v0.16b, v0.16b, v5.16b

	tbl	v2.16b, {v0.16b}, v2.16b
	tbx	v0.16b, {v1.16b}, v3.16b

	st1	{v2.16b}, [x4]		/* overlapping stores */
	mov	w4, wzr
	b	.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 */
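/*
 * Core of the CBC-MAC based modes: 'dg' is the running digest, which is
 * XORed with each input block and then encrypted. 'enc_before' requests an
 * extra encryption of the digest before any input is consumed, 'enc_after'
 * one after the final block. The return value is the number of blocks left
 * unprocessed if the loop had to yield early.
 */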
AES_FUNC_START(aes_mac_update)
	ld1	{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz	w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs	w3, w3, #4
	bmi	.Lmac1x
	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor	v0.16b, v0.16b, v4.16b
	cmp	w3, wzr
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1	{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b	.Lmacloop4x
.Lmac1x:
	add	w3, w3, #4
.Lmacloop:
	cbz	w3, .Lmacout
	ld1	{v1.16b}, [x0], #16		/* get next pt block */
	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs	w3, w3, #1
	csinv	x5, x6, xzr, eq
	cbz	w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b	.Lmacloop

.Lmacout:
	st1	{v0.16b}, [x4]			/* return dg */
	mov	w0, w3
	ret
AES_FUNC_END(aes_mac_update)