Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/loongarch/vdso/vgetrandom-chacha.S
26427 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2024 Xi Ruoyao <[email protected]>. All Rights Reserved.
4
*/
5
6
#include <asm/asm.h>
7
#include <asm/regdef.h>
8
#include <linux/linkage.h>
9
10
.text
11
12
/*
 * Apply one three-operand instruction to four destination/source pairs.
 *
 * Expands to
 *
 *	\op d0, d0, s0
 *	\op d1, d1, s1
 *	\op d2, d2, s2
 *	\op d3, d3, s3
 *
 * so every destination is also the first source operand.  The s0..s3
 * arguments are pasted verbatim as the last operand, so they may be
 * registers or immediates, whichever \op accepts.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm
18
19
/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count is decremented and only then tested, so a3 must not be
 * zero on entry.  The counter is read once up front, kept in registers
 * while blocks are generated, and written back through a2 once at the end.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer */
#define s9		fp

/* Arguments, named after their role */
#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3

/* Double-round loop counter */
#define i		a4

/*
 * The sixteen 32-bit words of the ChaCha state.  state0-state9 live in
 * the callee-saved s0-s9 (saved/restored around the body), state10-state15
 * in caller-saved argument/temporary registers.
 */
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2

/* Block counter, held in registers across the whole block loop */
#define cnt_lo		t3
#define cnt_hi		t4

/* The four constant words of the state, loaded once and reused per block */
#define copy0		t5
#define copy1		t6
#define copy2		t7
#define copy3		t8

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* Rows 1-3 rotated left by 1, 2 and 3 words: pairs row 0 with the diagonals */
#define	line1_perm	state5, state6, state7, state4
#define	line2_perm	state10, state11, state8, state9
#define	line3_perm	state15, state12, state13, state14

#define	copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts for rotri.w.  The instruction rotates right, so these
 * are 32 minus the ChaCha left-rotate amounts 16, 12, 8 and 7.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* copy[0,1,2,3] = "expand 32-byte k", as four little-endian words */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32
	li.w		copy3, 0x6b206574

	/* [cnt_lo, cnt_hi] = counter */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	move		state3, copy3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo32(key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = hi32(key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter
	 *
	 * sltui gives 1 exactly when cnt_lo wrapped around to 0, which is
	 * the carry into cnt_hi.  state0 is scratch at this point.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and
	 * state[0,...,9] are s0-s9 those we'll restore in the epilogue, so we
	 * only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	/* Restore the caller's s0-s9 and release the save area */
	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
254
255