GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/mips/chacha-core.S
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <[email protected]>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 */

#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32

#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n

/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0 $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X X15
#define SAVED_CA $s7

#define IS_UNALIGNED $s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
        wsbh n, n; \
        rotr n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif
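
/* On big endian, CPU_TO_LE32() byte-swaps a word into little-endian
 * order: wsbh swaps the bytes within each 16-bit halfword, and the
 * rotr by 16 then swaps the two halfwords. On little endian the
 * macro expands to nothing.
 */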

#define FOR_EACH_WORD(x) \
        x( 0); \
        x( 1); \
        x( 2); \
        x( 3); \
        x( 4); \
        x( 5); \
        x( 6); \
        x( 7); \
        x( 8); \
        x( 9); \
        x(10); \
        x(11); \
        x(12); \
        x(13); \
        x(14); \
        x(15);

#define FOR_EACH_WORD_REV(x) \
        x(15); \
        x(14); \
        x(13); \
        x(12); \
        x(11); \
        x(10); \
        x( 9); \
        x( 8); \
        x( 7); \
        x( 6); \
        x( 5); \
        x( 4); \
        x( 3); \
        x( 2); \
        x( 1); \
        x( 0);

#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
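
/* Note on the label scheme below: STORE_*(x) labels the store for word x
 * as PLUS_ONE(x), i.e. x + 1, and the stores are emitted in reverse order
 * (word 15 first, word 0 last). Jumping to label n therefore falls through
 * the stores for words n-1 .. 0, which is how the jump tables handle a
 * partial block of n full words. STORE_UNALIGNED uses the lwl/lwr and
 * swl/swr pairs so words can be read and written at any alignment.
 */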

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
        lw T0, (x*4)(STATE); \
        .endif; \
        lwl T1, (x*4)+MSB ## (IN); \
        lwr T1, (x*4)+LSB ## (IN); \
        .if (x == 12); \
        addu X ## x, NONCE_0; \
        .else; \
        addu X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor X ## x, T1; \
        swl X ## x, (x*4)+MSB ## (OUT); \
        swr X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
        lw T0, (x*4)(STATE); \
        .endif; \
        lw T1, (x*4) ## (IN); \
        .if (x == 12); \
        addu X ## x, NONCE_0; \
        .else; \
        addu X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor X ## x, T1; \
        sw X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and for handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
        .set noreorder; \
        b .Lchacha_mips_xor_aligned_ ## x ## _b; \
        .if (x == 12); \
        addu SAVED_X, X ## x, NONCE_0; \
        .else; \
        addu SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
        .set noreorder; \
        b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
        .if (x == 12); \
        addu SAVED_X, X ## x, NONCE_0; \
        .else; \
        addu SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set reorder
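
/* Each JMPTBL_* entry is exactly two instructions (8 bytes): the branch
 * plus the addu in its delay slot (.set noreorder stops the assembler
 * from reordering or filling the slot). The dispatch code further down
 * relies on this fixed entry size when computing an entry's address.
 */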

#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
        addu X(A), X(K); \
        addu X(B), X(L); \
        addu X(C), X(M); \
        addu X(D), X(N); \
        xor X(V), X(A); \
        xor X(W), X(B); \
        xor X(Y), X(C); \
        xor X(Z), X(D); \
        rotr X(V), 32 - S; \
        rotr X(W), 32 - S; \
        rotr X(Y), 32 - S; \
        rotr X(Z), 32 - S;
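
/* AXR() performs four ChaCha quarter-round steps in parallel:
 * X(A) += X(K); X(V) ^= X(A); X(V) <<<= S; and likewise for the
 * B/L/W, C/M/Y and D/N/Z lanes. MIPS32r2 only has a rotate-right
 * instruction, so the left-rotate by S is done as rotr by 32 - S.
 */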

.text
.set reorder
.set noat
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
        .frame $sp, STACK_SIZE, $ra

        /* Load number of rounds */
        lw $at, 16($sp)

        addiu $sp, -STACK_SIZE

        /* Return if BYTES == 0; nothing to do. */
        beqz BYTES, .Lchacha_mips_end

        lw NONCE_0, 48(STATE)

        /* Save s0-s7 */
        sw $s0, 0($sp)
        sw $s1, 4($sp)
        sw $s2, 8($sp)
        sw $s3, 12($sp)
        sw $s4, 16($sp)
        sw $s5, 20($sp)
        sw $s6, 24($sp)
        sw $s7, 28($sp)

        /* Test whether IN or OUT is unaligned:
         * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
         */
        or IS_UNALIGNED, IN, OUT
        andi IS_UNALIGNED, 0x3
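
        /* The alignment of IN and OUT is loop-invariant: both advance by
         * CHACHA20_BLOCK_SIZE (a multiple of 4) per block, so this one
         * check is enough to pick the aligned or unaligned store path
         * for the whole request.
         */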

        b .Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
        addiu IN, CHACHA20_BLOCK_SIZE
        addiu OUT, CHACHA20_BLOCK_SIZE
        addiu NONCE_0, 1

.Lchacha_rounds_start:
        lw X0, 0(STATE)
        lw X1, 4(STATE)
        lw X2, 8(STATE)
        lw X3, 12(STATE)

        lw X4, 16(STATE)
        lw X5, 20(STATE)
        lw X6, 24(STATE)
        lw X7, 28(STATE)
        lw X8, 32(STATE)
        lw X9, 36(STATE)
        lw X10, 40(STATE)
        lw X11, 44(STATE)

        move X12, NONCE_0
        lw X13, 52(STATE)
        lw X14, 56(STATE)
        lw X15, 60(STATE)

.Loop_chacha_xor_rounds:
        addiu $at, -2
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
        bnez $at, .Loop_chacha_xor_rounds
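
        /* The loop above executes two ChaCha rounds per iteration: the
         * first four AXR() lines form the column round, the last four
         * the diagonal round, hence the round counter in $at counting
         * down by 2.
         */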

        addiu BYTES, -(CHACHA20_BLOCK_SIZE)

        /* Is data src/dst unaligned? Jump */
        bnez IS_UNALIGNED, .Loop_chacha_unaligned

        /* Set number of rounds here to fill the delay slot. */
        lw $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0? No full block left. */
        bltz BYTES, .Lchacha_mips_no_full_block_aligned

        FOR_EACH_WORD_REV(STORE_ALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz BYTES, .Loop_chacha_rounds

        /* Place this here to fill the delay slot */
        addiu NONCE_0, 1

        /* BYTES < 0? Handle the last bytes */
        bltz BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
        /* Restore used registers */
        lw $s0, 0($sp)
        lw $s1, 4($sp)
        lw $s2, 8($sp)
        lw $s3, 12($sp)
        lw $s4, 16($sp)
        lw $s5, 20($sp)
        lw $s6, 24($sp)
        lw $s7, 28($sp)

        /* Write NONCE_0 back to the right location in state */
        sw NONCE_0, 48(STATE)

.Lchacha_mips_end:
        addiu $sp, STACK_SIZE
        jr $ra

.Lchacha_mips_no_full_block_aligned:
        /* Restore the offset on BYTES */
        addiu BYTES, CHACHA20_BLOCK_SIZE

        /* Get number of full WORDS */
        andi $at, BYTES, MASK_U32

        /* Load upper half of jump table addr */
        lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

        /* Calculate lower half jump table offset: $at holds the full-word
         * byte count (words * 4); inserting it at bit 1 scales it to
         * words * 8, the size of one jump table entry.
         */
        ins T0, $at, 1, 6

        /* Add offset to STATE */
        addu T1, STATE, $at

        /* Add lower half jump table addr */
        addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

        /* Read value from STATE */
        lw SAVED_CA, 0(T1)

        /* Store the remaining byte counter as a negative value */
        subu BYTES, $at, BYTES

        jr T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_ALIGNED)

.Loop_chacha_unaligned:
        /* Set number of rounds here to fill the delay slot. */
        lw $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0? No full block left. */
        bltz BYTES, .Lchacha_mips_no_full_block_unaligned

        FOR_EACH_WORD_REV(STORE_UNALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz BYTES, .Loop_chacha_rounds

        /* Write NONCE_0 back to the right location in state */
        sw NONCE_0, 48(STATE)

        .set noreorder
        /* Fall through to byte handling */
        bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
        /* Place this here to fill the delay slot */
        addiu NONCE_0, 1
        .set reorder
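
/* On entry to .Lchacha_mips_xor_bytes, $at holds the byte count already
 * handled (full words only) and BYTES holds the remaining 1-3 bytes as a
 * negative count. SAVED_X holds the keystream word covering those bytes;
 * each rotr by 8 moves the next keystream byte into the low position.
 */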

.Lchacha_mips_xor_bytes:
        addu IN, $at
        addu OUT, $at
        /* First byte */
        lbu T1, 0(IN)
        addiu $at, BYTES, 1
        xor T1, SAVED_X
        sb T1, 0(OUT)
        beqz $at, .Lchacha_mips_xor_done
        /* Second byte */
        lbu T1, 1(IN)
        addiu $at, BYTES, 2
        rotr SAVED_X, 8
        xor T1, SAVED_X
        sb T1, 1(OUT)
        beqz $at, .Lchacha_mips_xor_done
        /* Third byte */
        lbu T1, 2(IN)
        rotr SAVED_X, 8
        xor T1, SAVED_X
        sb T1, 2(OUT)
        b .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
        /* Restore the offset on BYTES */
        addiu BYTES, CHACHA20_BLOCK_SIZE

        /* Get number of full WORDS */
        andi $at, BYTES, MASK_U32

        /* Load upper half of jump table addr */
        lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

        /* Calculate lower half jump table offset: scale the full-word
         * byte count in $at by 2 to get the 8-byte entry offset.
         */
        ins T0, $at, 1, 6

        /* Add offset to STATE */
        addu T1, STATE, $at

        /* Add lower half jump table addr */
        addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

        /* Read value from STATE */
        lw SAVED_CA, 0(T1)

        /* Store the remaining byte counter as a negative value */
        subu BYTES, $at, BYTES

        jr T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE  $a0
 * OUT    $a1
 * NROUND $a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
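
/* hchacha_block_arch(state, out, nrounds) runs the ChaCha permutation
 * on the state and writes out words 0-3 and 12-15 only, without the
 * final feed-forward addition; this is the HChaCha construction used
 * for XChaCha key derivation.
 */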
.set noat
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
        .frame $sp, STACK_SIZE, $ra

        addiu $sp, -STACK_SIZE

        /* Save X11(s6) */
        sw X11, 0($sp)

        lw X0, 0(STATE)
        lw X1, 4(STATE)
        lw X2, 8(STATE)
        lw X3, 12(STATE)
        lw X4, 16(STATE)
        lw X5, 20(STATE)
        lw X6, 24(STATE)
        lw X7, 28(STATE)
        lw X8, 32(STATE)
        lw X9, 36(STATE)
        lw X10, 40(STATE)
        lw X11, 44(STATE)
        lw X12, 48(STATE)
        lw X13, 52(STATE)
        lw X14, 56(STATE)
        lw X15, 60(STATE)

.Loop_hchacha_xor_rounds:
        addiu $a2, -2
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
        AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
        AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
        bnez $a2, .Loop_hchacha_xor_rounds

        /* Restore used register */
        lw X11, 0($sp)

        sw X0, 0(OUT)
        sw X1, 4(OUT)
        sw X2, 8(OUT)
        sw X3, 12(OUT)
        sw X12, 16(OUT)
        sw X13, 20(OUT)
        sw X14, 24(OUT)
        sw X15, 28(OUT)

        addiu $sp, STACK_SIZE
        jr $ra
.end hchacha_block_arch
.set at