GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/arm/chacha-scalar-core.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
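
/*
 * As a rough C sketch of the deferred-rotation idea (ror32() is the
 * kernel's 32-bit rotate-right helper from <linux/bitops.h>; 'brot' and
 * 'drot' are the pending rotations described above), the step
 * "a += b; d ^= a; d = rol(d, 16);" is performed as:
 *
 *	a += ror32(b, brot);		// add a, a, b, ror #brot
 *	d  = a ^ ror32(d, drot);	// eor d, a, d, ror #drot
 *	drot = 32 - 16;			// the rol(d, 16) is never executed;
 *					// 'd' now needs ror #16 to be correct
 *
 * i.e. each rotate is folded into the shifter operand of the next
 * instruction that consumes the value.
 */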

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro _le32_bswap_4x	a, b, c, d, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
	rev_l		\b, \tmp
	rev_l		\c, \tmp
	rev_l		\d, \tmp
#endif
.endm
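
// Note: ChaCha's keystream is defined in terms of little-endian words, so on
// a big-endian kernel (__ARMEB__) each word must be byte-swapped before it is
// XORed with the data. rev_l is the byte-swap helper from <asm/assembler.h>;
// \tmp is a scratch register for its pre-ARMv6 fallback sequence.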

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1, a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm
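
/*
 * For reference: each _halfround performs two independent copies of the
 * standard ChaCha quarter-round (RFC 8439 notation, rol32() being a 32-bit
 * rotate left):
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * with every rol32() deferred into the 'ror #' operand of a later
 * instruction, as described in the design notes above.
 */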

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm
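
/*
 * A C sketch of the permutation applied by _chacha_permute (QUARTERROUND in
 * RFC 8439 notation; the name is illustrative, not a helper in this file):
 *
 *	for (i = 0; i < nrounds; i += 2) {
 *		// column round
 *		QUARTERROUND(x[0], x[4], x[8], x[12]);
 *		QUARTERROUND(x[1], x[5], x[9], x[13]);
 *		QUARTERROUND(x[2], x[6], x[10], x[14]);
 *		QUARTERROUND(x[3], x[7], x[11], x[15]);
 *		// diagonal round
 *		QUARTERROUND(x[0], x[5], x[10], x[15]);
 *		QUARTERROUND(x[1], x[6], x[11], x[12]);
 *		QUARTERROUND(x[2], x[7], x[8], x[13]);
 *		QUARTERROUND(x[3], x[4], x[9], x[14]);
 *	}
 */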

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.
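	//
	// In C terms, roughly (ks[] being the keystream bytes built on the
	// stack below):
	//
	//	for (i = 0; i < min(len, 64); i++)
	//		out[i] = in[i] ^ ks[i];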

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha
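
/*
 * Rough per-block outline of the _chacha macro in C (permute() and
 * xor_bytes() are illustrative names, not real helpers; min() and
 * cpu_to_le32() as in the kernel):
 *
 *	while (len) {
 *		unsigned int n = min(len, 64);
 *		u32 ks[16];
 *
 *		memcpy(ks, x, 64);
 *		permute(ks, nrounds);			// _chacha_permute
 *		for (i = 0; i < 16; i++)
 *			ks[i] = cpu_to_le32(ks[i] + x[i]);	// feed-forward
 *		xor_bytes(out, in, ks, n);	// fast path if n == 64 and
 *						// aligned, else slow path
 *		x[12]++;			// advance block counter
 *		out += n; in += n; len -= n;
 *	}
 */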

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const struct chacha_state *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0		// len == 0?
	reteq		lr

	ldr		ip, [sp]	// load nrounds (5th argument)
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f		// nrounds == 12?
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const struct chacha_state *state,
 *			  u32 out[HCHACHA_OUT_WORDS], int nrounds);
 */
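/*
 * HChaCha is the ChaCha permutation without the final feed-forward
 * addition; only the first row (x0-x3, the constants) and the last row
 * (x12-x15) of the permuted state are written out. Sketch in C (permute()
 * as in the earlier outline):
 *
 *	permute(x, nrounds);
 *	memcpy(&out[0], &x[0], 16);
 *	memcpy(&out[4], &x[12], 16);
 */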
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)