CoCalc -- chacha-scalar-core.S

GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/chacha-scalar-core.S
26282 views
1
/* SPDX-License-Identifier: GPL-2.0 */
2
/*
 * Copyright (C) 2018 Google, Inc.
 */
5

6
#include <linux/linkage.h>
7
#include <asm/assembler.h>
8

9
/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
27

28
	// ChaCha state registers
	//
	// Symbolic names for the core registers that hold the state words.
	// x8/x10 and x9/x11 take turns in r8/r9 (see the design notes above).
	// r13 (sp) and r15 (pc) are skipped, which is why x15 is kept in r14.
29
	X0	.req	r0
30
	X1	.req	r1
31
	X2	.req	r2
32
	X3	.req	r3
33
	X4	.req	r4
34
	X5	.req	r5
35
	X6	.req	r6
36
	X7	.req	r7
37
	X8_X10	.req	r8	// shared by x8 and x10
38
	X9_X11	.req	r9	// shared by x9 and x11
39
	X12	.req	r10
40
	X13	.req	r11
41
	X14	.req	r12
42
	X15	.req	r14
43

44
// _le32_bswap_4x a, b, c, d, tmp
//
// Apply 'rev_l' to each of the registers a, b, c and d in that order, passing
// tmp as its second operand every time.  Expands to nothing unless
// __ARMEB__ is defined.
// NOTE(review): 'rev_l' is not defined in this file (presumably it comes from
// <asm/assembler.h> and byte-reverses a word using tmp as scratch) - confirm.
.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	.irp		word, \a, \b, \c, \d
	rev_l		\word,  \tmp
	.endr
#endif
.endm
52

53
// __ldrd a, b, src, offset
//
// Load two adjacent 32-bit words: a = [src + offset], b = [src + offset + 4].
// A pair of 'ldr' is emitted when __LINUX_ARM_ARCH__ is below 6; otherwise a
// single 'ldrd' is used.  Every caller in this file passes an even-numbered
// register followed by the next one (r8/r9, r10/r11, r0/r1, ...).
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ < 6
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#else
	ldrd		\a, \b, [\src, #\offset]
#endif
.endm
61

62
// __strd a, b, dst, offset
//
// Store two adjacent 32-bit words: [dst + offset] = a, [dst + offset + 4] = b.
// A pair of 'str' is emitted when __LINUX_ARM_ARCH__ is below 6; otherwise a
// single 'strd' is used.  Every caller in this file passes an even-numbered
// register followed by the next one (r8/r9, r10/r11, r0/r1, ...).
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ < 6
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#else
	strd		\a, \b, [\dst, #\offset]
#endif
.endm
70

71
// _halfround a1, b1, c1, d1,  a2, b2, c2, d2
//
// Run two independent ChaCha quarter-rounds, one on (a1, b1, c1, d1) and one
// on (a2, b2, c2, d2), with their instructions interleaved pairwise.
//
// No rotate instruction is emitted.  Each rol() of the reference algorithm
// (quoted in the step comments below) is left pending and folded into the
// 'ror' shifter operand of the next instruction that reads the value:
//   - on entry the b registers still need 'ror #brot' and the d registers
//     'ror #drot', where brot/drot are assembler symbols set by the caller;
//   - on exit the b registers need 'ror #25' and the d registers 'ror #24'.
// The a and c registers are only ever used unrotated.
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
72

73
	// a += b; d ^= a; d = rol(d, 16);
74
	add		\a1, \a1, \b1, ror #brot
75
	add		\a2, \a2, \b2, ror #brot
76
	eor		\d1, \a1, \d1, ror #drot
77
	eor		\d2, \a2, \d2, ror #drot
78
	// drot == 32 - 16 == 16
79

80
	// c += d; b ^= c; b = rol(b, 12);
81
	add		\c1, \c1, \d1, ror #16
82
	add		\c2, \c2, \d2, ror #16
83
	eor		\b1, \c1, \b1, ror #brot
84
	eor		\b2, \c2, \b2, ror #brot
85
	// brot == 32 - 12 == 20
86

87
	// a += b; d ^= a; d = rol(d, 8);
88
	add		\a1, \a1, \b1, ror #20
89
	add		\a2, \a2, \b2, ror #20
90
	eor		\d1, \a1, \d1, ror #16
91
	eor		\d2, \a2, \d2, ror #16
92
	// drot == 32 - 8 == 24
93

94
	// c += d; b ^= c; b = rol(b, 7);
95
	add		\c1, \c1, \d1, ror #24
96
	add		\c2, \c2, \d2, ror #24
97
	eor		\b1, \c1, \b1, ror #20
98
	eor		\b2, \c2, \b2, ror #20
99
	// brot == 32 - 7 == 25
100
.endm
101

102
// _doubleround
//
// One ChaCha double round: a column round followed by a diagonal round, each
// built from two _halfround expansions.
//
// r8/r9 (X8_X10/X9_X11) hold x8-x9 on entry and again on exit.  In between
// they are swapped with x10-x11 through two stack slots: x8-x9 live at
// [sp, #0] and x10-x11 at [sp, #8].
//
// brot/drot must be set by the caller to the rotation still pending on rows
// 'b' and 'd'.  They are switched to (25, 24) only after both column
// halfrounds, because until then part of each row is still in its incoming
// state.
.macro _doubleround
103

104
	// column round
105

106
	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
107
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
108

109
	// save (x8, x9); restore (x10, x11)
110
	__strd		X8_X10, X9_X11, sp, 0
111
	__ldrd		X8_X10, X9_X11, sp, 8
112

113
	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
114
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
115

116
	// From here on every b word is pending 'ror #25' and every d word
	// 'ror #24', which is what _halfround leaves behind.
	.set brot, 25
117
	.set drot, 24
118

119
	// diagonal round
120

121
	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
122
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
123

124
	// save (x10, x11); restore (x8, x9)
125
	__strd		X8_X10, X9_X11, sp, 8
126
	__ldrd		X8_X10, X9_X11, sp, 0
127

128
	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
129
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
130
.endm
131

132
// _chacha_permute nrounds
//
// Emit the ChaCha permutation as \nrounds / 2 consecutive _doubleround
// expansions.  Both pending-rotation symbols are reset to 0 first, i.e. rows
// 'b' and 'd' are taken to hold correctly rotated values on entry.  After the
// expansion brot/drot keep whatever _doubleround last set them to.
.macro _chacha_permute	nrounds
	.set drot, 0
	.set brot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm
139

140
// _chacha nrounds
//
// Main loop used by chacha_doarm.  Each iteration runs the \nrounds-round
// permutation on the state held in registers, adds the original state back in
// (read from the stack copy), and XORs the resulting 64 bytes of keystream
// with IN into OUT.  It then bumps the block counter in the stack copy and
// repeats until LEN is used up.
//
// On entry the registers hold x0-x9,x12-x15 and the stack is laid out as
// described at .Lnext_block.  On exit (.Ldone) sp points at the stack copy of
// x0, followed by OUT, IN and LEN.
//
// Two data paths exist per block:
//   - fast path: at least 64 bytes left and IN/OUT both 4-byte aligned;
//   - slow path: the keystream is written to a 64-byte stack buffer first and
//     then XORed a word or a byte at a time.
//
// All labels carry \@, so the macro may be expanded more than once in a file.
//
// NOTE(review): LEN is tested with signed conditions (blt/movle/movgt/bgt).
// A length of 2^31 or more would be treated as negative - presumably callers
// never pass such a length; confirm.
.macro _chacha		nrounds
141

142
.Lnext_block\@:
143
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
144
	// Registers contain x0-x9,x12-x15.
145

146
	// Do the core ChaCha permutation to update x0-x15.
147
	_chacha_permute	\nrounds
148

149
	// Drop the x8-x9 spill slot used by _doubleround; x8-x9 are back in
	// r8-r9 at this point.
	add		sp, #8
150
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
151
	// Registers contain x0-x9,x12-x15.
152
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
153

154
	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
155
	push		{X8_X10, X9_X11, X12, X13, X14, X15}
156

157
	// Load (OUT, IN, LEN).
158
	ldr		r14, [sp, #96]
159
	ldr		r12, [sp, #100]
160
	ldr		r11, [sp, #104]
161

162
	// r10 = OUT | IN, so a single 'tst' below checks both pointers.
	orr		r10, r14, r12
163

164
	// Use slow path if fewer than 64 bytes remain.
165
	cmp		r11, #64
166
	blt		.Lxor_slowpath\@
167

168
	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
169
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
170
	tst		r10, #3
171
	bne		.Lxor_slowpath\@
172

173
	// Fast path: XOR 64 bytes of aligned data.
174

175
	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
176
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
177
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
178

179
	// x0-x3
180
	__ldrd		r8, r9, sp, 32
181
	__ldrd		r10, r11, sp, 40
182
	add		X0, X0, r8
183
	add		X1, X1, r9
184
	add		X2, X2, r10
185
	add		X3, X3, r11
186
	_le32_bswap_4x	X0, X1, X2, X3,  r8
187
	ldmia		r12!, {r8-r11}
188
	eor		X0, X0, r8
189
	eor		X1, X1, r9
190
	eor		X2, X2, r10
191
	eor		X3, X3, r11
192
	stmia		r14!, {X0-X3}
193

194
	// x4-x7
195
	__ldrd		r8, r9, sp, 48
196
	__ldrd		r10, r11, sp, 56
197
	add		X4, r8, X4, ror #brot
198
	add		X5, r9, X5, ror #brot
199
	// x0-x3 have been stored, so r0-r3 are reused for the next 16 input
	// bytes.
	ldmia		r12!, {X0-X3}
200
	add		X6, r10, X6, ror #brot
201
	add		X7, r11, X7, ror #brot
202
	_le32_bswap_4x	X4, X5, X6, X7,  r8
203
	eor		X4, X4, X0
204
	eor		X5, X5, X1
205
	eor		X6, X6, X2
206
	eor		X7, X7, X3
207
	stmia		r14!, {X4-X7}
208

209
	// x8-x15
210
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
211
	__ldrd		r8, r9, sp, 32
212
	__ldrd		r10, r11, sp, 40
213
	add		r0, r0, r8		// x8
214
	add		r1, r1, r9		// x9
215
	add		r6, r6, r10		// x10
216
	add		r7, r7, r11		// x11
217
	_le32_bswap_4x	r0, r1, r6, r7,  r8
218
	ldmia		r12!, {r8-r11}
219
	eor		r0, r0, r8		// x8
220
	eor		r1, r1, r9		// x9
221
	eor		r6, r6, r10		// x10
222
	eor		r7, r7, r11		// x11
223
	stmia		r14!, {r0,r1,r6,r7}
224
	ldmia		r12!, {r0,r1,r6,r7}
225
	__ldrd		r8, r9, sp, 48
226
	__ldrd		r10, r11, sp, 56
227
	add		r2, r8, r2, ror #drot	// x12
228
	add		r3, r9, r3, ror #drot	// x13
229
	add		r4, r10, r4, ror #drot	// x14
230
	add		r5, r11, r5, ror #drot	// x15
231
	// The scratch register is r9 here, so r8 keeps orig_x12 (the block
	// counter) for .Lprepare_for_next_block.
	_le32_bswap_4x	r2, r3, r4, r5,  r9
232
	  ldr		r9, [sp, #72]		// load LEN
233
	eor		r2, r2, r0		// x12
234
	eor		r3, r3, r1		// x13
235
	eor		r4, r4, r6		// x14
236
	eor		r5, r5, r7		// x15
237
	  subs		r9, #64			// decrement and check LEN
238
	stmia		r14!, {r2-r5}
239

240
	// Flags still come from the 'subs' above: Z set means LEN reached 0.
	beq		.Ldone\@
241

242
.Lprepare_for_next_block\@:
243

244
	// Stack: x0-x15 OUT IN LEN
245

246
	// Increment block counter (x12)
247
	// r8 holds orig_x12 on both the fast and the slow path.
	add		r8, #1
248

249
	// Store updated (OUT, IN, LEN)
250
	str		r14, [sp, #64]
251
	str		r12, [sp, #68]
252
	str		r9, [sp, #72]
253

254
	// r14 = address of x0, used as the base for the reloads below.
	  mov		r14, sp
255

256
	// Store updated block counter (x12)
257
	str		r8, [sp, #48]
258

259
	// Re-create the (unused0-unused1, x10-x11) slots below the state.
	  sub		sp, #16
260

261
	// Reload state and do next block
262
	ldmia		r14!, {r0-r11}		// load x0-x11
263
	__strd		r10, r11, sp, 8		// store x10-x11 before state
264
	ldmia		r14, {r10-r12,r14}	// load x12-x15
265
	b		.Lnext_block\@
266

267
.Lxor_slowpath\@:
268
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
269
	// We handle it by storing the 64 bytes of keystream to the stack, then
270
	// XOR-ing the needed portion with the data.
271

272
	// Allocate keystream buffer
273
	sub		sp, #64
274
	mov		r14, sp
275

276
	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
277
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
278
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
279

280
	// Save keystream for x0-x3
281
	__ldrd		r8, r9, sp, 96
282
	__ldrd		r10, r11, sp, 104
283
	add		X0, X0, r8
284
	add		X1, X1, r9
285
	add		X2, X2, r10
286
	add		X3, X3, r11
287
	_le32_bswap_4x	X0, X1, X2, X3,  r8
288
	stmia		r14!, {X0-X3}
289

290
	// Save keystream for x4-x7
291
	__ldrd		r8, r9, sp, 112
292
	__ldrd		r10, r11, sp, 120
293
	add		X4, r8, X4, ror #brot
294
	add		X5, r9, X5, ror #brot
295
	add		X6, r10, X6, ror #brot
296
	add		X7, r11, X7, ror #brot
297
	_le32_bswap_4x	X4, X5, X6, X7,  r8
298
	// r8 = address of the pushed (x8-x9,x12-x15) followed by x10-x11.
	  add		r8, sp, #64
299
	stmia		r14!, {X4-X7}
300

301
	// Save keystream for x8-x15
302
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
303
	__ldrd		r8, r9, sp, 128
304
	__ldrd		r10, r11, sp, 136
305
	add		r0, r0, r8		// x8
306
	add		r1, r1, r9		// x9
307
	add		r6, r6, r10		// x10
308
	add		r7, r7, r11		// x11
309
	_le32_bswap_4x	r0, r1, r6, r7,  r8
310
	stmia		r14!, {r0,r1,r6,r7}
311
	__ldrd		r8, r9, sp, 144
312
	__ldrd		r10, r11, sp, 152
313
	add		r2, r8, r2, ror #drot	// x12
314
	add		r3, r9, r3, ror #drot	// x13
315
	add		r4, r10, r4, ror #drot	// x14
316
	add		r5, r11, r5, ror #drot	// x15
317
	// As on the fast path, r9 is the scratch register so r8 keeps
	// orig_x12.
	_le32_bswap_4x	r2, r3, r4, r5,  r9
318
	stmia		r14, {r2-r5}
319

320
	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
321
	// Registers: r8 is block counter, r12 is IN.
322

323
	ldr		r9, [sp, #168]		// LEN
324
	ldr		r14, [sp, #160]		// OUT
325
	cmp		r9, #64
326
	// r0 = address of the keystream buffer (ks0).
	  mov		r0, sp
327
	movle		r1, r9
328
	movgt		r1, #64
329
	// r1 is number of bytes to XOR, in range [1, 64]
330

331
	// When built for __LINUX_ARM_ARCH__ < 6, misaligned IN/OUT skip the
	// word loop and go straight to the byte loop.
.if __LINUX_ARM_ARCH__ < 6
332
	orr		r2, r12, r14
333
	tst		r2, #3			// IN or OUT misaligned?
334
	bne		.Lxor_next_byte\@
335
.endif
336

337
	// XOR a word at a time
338
.rept 16
339
	subs		r1, #4
340
	blt		.Lxor_words_done\@
341
	ldr		r2, [r12], #4
342
	ldr		r3, [r0], #4
343
	eor		r2, r2, r3
344
	str		r2, [r14], #4
345
.endr
346
	b		.Lxor_slowpath_done\@
347
.Lxor_words_done\@:
348
	// r1 went negative by one 'subs #4' too many; its low two bits are
	// unchanged by that, so r1 & 3 is the number of tail bytes left.
	ands		r1, r1, #3
349
	beq		.Lxor_slowpath_done\@
350

351
	// XOR a byte at a time
352
.Lxor_next_byte\@:
353
	ldrb		r2, [r12], #1
354
	ldrb		r3, [r0], #1
355
	eor		r2, r2, r3
356
	strb		r2, [r14], #1
357
	subs		r1, #1
358
	bne		.Lxor_next_byte\@
359

360
.Lxor_slowpath_done\@:
361
	subs		r9, #64
362
	// Release the 64-byte keystream buffer plus the 32 bytes of saved
	// words (unused0-unused7).  'add' leaves the flags alone, so the 'bgt'
	// below still tests the result of the 'subs' above.
	add		sp, #96
363
	bgt		.Lprepare_for_next_block\@
364

365
.Ldone\@:
366
.endm	// _chacha
367

368
/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const struct chacha_state *state, int nrounds);
 *
 * XOR 'bytes' bytes of 'src' with ChaCha keystream generated from 'state' and
 * write the result to 'dst'.  Returns immediately when bytes == 0.
 *
 * In: r0 = dst, r1 = src, r2 = bytes, r3 = state, [sp] = nrounds.
 * nrounds == 12 selects the 12-round expansion of _chacha; every other value
 * runs the 20-round one.
 *
 * The block counter is advanced only in the copy of the state kept on the
 * stack; nothing is stored back through 'state'.
 */
372
ENTRY(chacha_doarm)
373
	cmp		r2, #0			// len == 0?
374
	reteq		lr
375

376
	// ip = nrounds, read before the pushes below move sp.  None of the
	// instructions up to the 'beq 1f' change the flags set by this compare.
	ldr		ip, [sp]
377
	cmp		ip, #12
378

379
	// r0-r2 end up at the lowest addresses and become the (OUT, IN, LEN)
	// words that _chacha reads and updates; r4-r11 and lr are saved for
	// the return.
	push		{r0-r2,r4-r11,lr}
380

381
	// Push state x0-x15 onto stack.
382
	// Also store an extra copy of x10-x11 just before the state.
383

384
	add		X12, r3, #48
385
	ldm		X12, {X12,X13,X14,X15}
386
	push		{X12,X13,X14,X15}
387
	sub		sp, sp, #64
388

389
	// x10-x11 are fetched through r3 first, because the 'ldm' below
	// overwrites r3 (X3) with x3.
	__ldrd		X8_X10, X9_X11, r3, 40
390
	__strd		X8_X10, X9_X11, sp, 8
391
	__strd		X8_X10, X9_X11, sp, 56
392
	ldm		r3, {X0-X9_X11}
393
	__strd		X0, X1, sp, 16
394
	__strd		X2, X3, sp, 24
395
	__strd		X4, X5, sp, 32
396
	__strd		X6, X7, sp, 40
397
	__strd		X8_X10, X9_X11, sp, 48
398

399
	// Stack is now: unused0-unused1 x10-x11 x0-x15 OUT IN LEN, as expected
	// by _chacha.
	beq		1f
400
	_chacha		20
401

402
	// Skip the 64-byte state copy and the three (OUT, IN, LEN) words.
0:	add		sp, #76
403
	pop		{r4-r11, pc}
404

405
1:	_chacha		12
406
	b		0b
407
ENDPROC(chacha_doarm)
408

409
/*
 * void hchacha_block_arm(const struct chacha_state *state,
 *			  u32 out[HCHACHA_OUT_WORDS], int nrounds);
 *
 * Load the 16 state words from 'state', run the ChaCha permutation on them
 * and store the resulting words x0-x3 and x12-x15 (8 words) to 'out'.
 * Unlike chacha_doarm, the original state is not added back in.
 *
 * In: r0 = state, r1 = out, r2 = nrounds.
 * nrounds == 12 selects 12 rounds; every other value runs 20.
 */
412
ENTRY(hchacha_block_arm)
413
	// r1 ('out') is pushed at the lowest address so that it can be popped
	// on its own later.
	push		{r1,r4-r11,lr}
414

415
	// The flags set here are consumed by the 'beq 1f' below; nothing in
	// between changes them.
	cmp		r2, #12			// ChaCha12 ?
416

417
	mov		r14, r0
418
	ldmia		r14!, {r0-r11}		// load x0-x11
419
	push		{r10-r11}		// store x10-x11 to stack
420
	ldm		r14, {r10-r12,r14}	// load x12-x15
421
	// Reserve the x8-x9 spill slot that _doubleround uses at [sp, #0].
	sub		sp, #8
422

423
	beq		1f
424
	_chacha_permute	20
425

426
	// Skip over (unused0-unused1, x10-x11)
427
0:	add		sp, #16
428

429
	// Fix up rotations of x12-x15
430
	// x4-x7 are left with their pending 'brot' rotation since they are not
	// part of the output.
	ror		X12, X12, #drot
431
	ror		X13, X13, #drot
432
	  pop		{r4}			// load 'out'
433
	ror		X14, X14, #drot
434
	ror		X15, X15, #drot
435

436
	// Store (x0-x3,x12-x15) to 'out'
437
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
438

439
	pop		{r4-r11,pc}
440

441
1:	_chacha_permute	12
442
	b		0b
443
ENDPROC(hchacha_block_arm)
445

446
Product

Resources

Company