GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/riscv/chacha-riscv64-zvkb.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

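// Availability of these extensions must be verified at runtime before this
// routine is called.  A hedged sketch of the kind of check the C glue code
// might make (riscv_isa_extension_available() and riscv_vector_vlen() are
// assumptions about the surrounding kernel code, not part of this file):
//
//	if (riscv_isa_extension_available(NULL, ZVKB) &&
//	    riscv_vector_vlen() >= 128)
//		/* chacha_zvkb() may be used */
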
#include <linux/linkage.h>

.text
.option arch, +zvkb
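// The .option directive above lets the assembler accept the Zvkb
// instructions used below (vror.vi) regardless of the base -march this
// file is built with.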

#define STATEP a0
#define INP a1
#define OUTP a2
#define NBLOCKS a3
#define NROUNDS a4

#define CONSTS0 a5
#define CONSTS1 a6
#define CONSTS2 a7
#define CONSTS3 t0
#define TMP t1
#define VL t2
#define STRIDE t3
#define ROUND_CTR t4
#define KEY0 s0
#define KEY1 s1
#define KEY2 s2
#define KEY3 s3
#define KEY4 s4
#define KEY5 s5
#define KEY6 s6
#define KEY7 s7
#define COUNTER s8
#define NONCE0 s9
#define NONCE1 s10
#define NONCE2 s11

.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
		    a2, b2, c2, d2, a3, b3, c3, d3
	// a += b; d ^= a; d = rol(d, 16);
	vadd.vv \a0, \a0, \b0
	vadd.vv \a1, \a1, \b1
	vadd.vv \a2, \a2, \b2
	vadd.vv \a3, \a3, \b3
	vxor.vv \d0, \d0, \a0
	vxor.vv \d1, \d1, \a1
	vxor.vv \d2, \d2, \a2
	vxor.vv \d3, \d3, \a3
	vror.vi \d0, \d0, 32 - 16
	vror.vi \d1, \d1, 32 - 16
	vror.vi \d2, \d2, 32 - 16
	vror.vi \d3, \d3, 32 - 16

	// c += d; b ^= c; b = rol(b, 12);
	vadd.vv \c0, \c0, \d0
	vadd.vv \c1, \c1, \d1
	vadd.vv \c2, \c2, \d2
	vadd.vv \c3, \c3, \d3
	vxor.vv \b0, \b0, \c0
	vxor.vv \b1, \b1, \c1
	vxor.vv \b2, \b2, \c2
	vxor.vv \b3, \b3, \c3
	vror.vi \b0, \b0, 32 - 12
	vror.vi \b1, \b1, 32 - 12
	vror.vi \b2, \b2, 32 - 12
	vror.vi \b3, \b3, 32 - 12

	// a += b; d ^= a; d = rol(d, 8);
	vadd.vv \a0, \a0, \b0
	vadd.vv \a1, \a1, \b1
	vadd.vv \a2, \a2, \b2
	vadd.vv \a3, \a3, \b3
	vxor.vv \d0, \d0, \a0
	vxor.vv \d1, \d1, \a1
	vxor.vv \d2, \d2, \a2
	vxor.vv \d3, \d3, \a3
	vror.vi \d0, \d0, 32 - 8
	vror.vi \d1, \d1, 32 - 8
	vror.vi \d2, \d2, 32 - 8
	vror.vi \d3, \d3, 32 - 8

	// c += d; b ^= c; b = rol(b, 7);
	vadd.vv \c0, \c0, \d0
	vadd.vv \c1, \c1, \d1
	vadd.vv \c2, \c2, \d2
	vadd.vv \c3, \c3, \d3
	vxor.vv \b0, \b0, \c0
	vxor.vv \b1, \b1, \c1
	vxor.vv \b2, \b2, \c2
	vxor.vv \b3, \b3, \c3
	vror.vi \b0, \b0, 32 - 7
	vror.vi \b1, \b1, 32 - 7
	vror.vi \b2, \b2, 32 - 7
	vror.vi \b3, \b3, 32 - 7
.endm
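
// For reference: each chacha_round invocation above performs four independent
// ChaCha quarter-rounds in parallel, with every vector lane holding the
// corresponding state word of a different block.  A minimal scalar C sketch of
// one quarter-round (QUARTERROUND and rol32 are illustrative names, not part
// of this file):
//
//	#define rol32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))
//	#define QUARTERROUND(a, b, c, d) do {		\
//		a += b; d ^= a; d = rol32(d, 16);	\
//		c += d; b ^= c; b = rol32(b, 12);	\
//		a += b; d ^= a; d = rol32(d, 8);	\
//		c += d; b ^= c; b = rol32(b, 7);	\
//	} while (0)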

// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
//                  size_t nblocks, int nrounds);
//
// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
//
// |state| gives the ChaCha state matrix, including the 32-bit counter in
// state->x[12] following the RFC 7539 convention; note that this differs from
// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13].
// The updated 32-bit counter is written back to state->x[12] before returning.
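//
// The loads below assume the conventional ChaCha word layout in state->x[]:
// x[0..3] = constants, x[4..11] = key, x[12] = block counter, x[13..15] =
// nonce.  A hedged caller sketch (chacha_init(), kernel_vector_begin()/
// kernel_vector_end(), and the exact struct chacha_state fields are
// assumptions about the surrounding kernel code; ChaCha20 uses nrounds = 20):
//
//	struct chacha_state state;
//
//	chacha_init(&state, key, iv);
//	kernel_vector_begin();
//	chacha_zvkb(&state, src, dst, nbytes / 64, 20);
//	kernel_vector_end();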
SYM_FUNC_START(chacha_zvkb)
	addi sp, sp, -96
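	// Save the callee-saved registers s0-s11; they hold the key, counter,
	// and nonce words across the main loop below.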
	sd s0, 0(sp)
	sd s1, 8(sp)
	sd s2, 16(sp)
	sd s3, 24(sp)
	sd s4, 32(sp)
	sd s5, 40(sp)
	sd s6, 48(sp)
	sd s7, 56(sp)
	sd s8, 64(sp)
	sd s9, 72(sp)
	sd s10, 80(sp)
	sd s11, 88(sp)

	li STRIDE, 64
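	// Each 64-byte ChaCha block is contiguous in memory, so 64 is the
	// stride between the same word of consecutive blocks; it is used by
	// the strided segment loads/stores below.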

	// Set up the initial state matrix in scalar registers.
	lw CONSTS0, 0(STATEP)
	lw CONSTS1, 4(STATEP)
	lw CONSTS2, 8(STATEP)
	lw CONSTS3, 12(STATEP)
	lw KEY0, 16(STATEP)
	lw KEY1, 20(STATEP)
	lw KEY2, 24(STATEP)
	lw KEY3, 28(STATEP)
	lw KEY4, 32(STATEP)
	lw KEY5, 36(STATEP)
	lw KEY6, 40(STATEP)
	lw KEY7, 44(STATEP)
	lw COUNTER, 48(STATEP)
	lw NONCE0, 52(STATEP)
	lw NONCE1, 56(STATEP)
	lw NONCE2, 60(STATEP)

.Lblock_loop:
	// Set vl to the number of blocks to process in this iteration.
	vsetvli VL, NBLOCKS, e32, m1, ta, ma

	// Set up the initial state matrix for the next VL blocks in v0-v15.
	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
	// Note that only the counter word, at index 12, differs across blocks.
	vmv.v.x v0, CONSTS0
	vmv.v.x v1, CONSTS1
	vmv.v.x v2, CONSTS2
	vmv.v.x v3, CONSTS3
	vmv.v.x v4, KEY0
	vmv.v.x v5, KEY1
	vmv.v.x v6, KEY2
	vmv.v.x v7, KEY3
	vmv.v.x v8, KEY4
	vmv.v.x v9, KEY5
	vmv.v.x v10, KEY6
	vmv.v.x v11, KEY7
	vid.v v12
	vadd.vx v12, v12, COUNTER
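	// v12 now holds COUNTER + lane index: the block counter for each of
	// the VL blocks processed in this iteration.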
	vmv.v.x v13, NONCE0
	vmv.v.x v14, NONCE1
	vmv.v.x v15, NONCE2

	// Load the first half of the input data for each block into v16-v23.
	// v{16+i} holds the i'th 32-bit word for all blocks.
	vlsseg8e32.v v16, (INP), STRIDE

	mv ROUND_CTR, NROUNDS
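	// Each iteration of the loop below performs one double round (a
	// column round followed by a diagonal round), consuming two of the
	// NROUNDS rounds, e.g. 10 iterations for ChaCha20.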
.Lnext_doubleround:
	addi ROUND_CTR, ROUND_CTR, -2
	// column round
	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
			v2, v6, v10, v14, v3, v7, v11, v15
	// diagonal round
	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
			v2, v7, v8, v13, v3, v4, v9, v14
	bnez ROUND_CTR, .Lnext_doubleround

	// Load the second half of the input data for each block into v24-v31.
	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
	addi TMP, INP, 32
	vlsseg8e32.v v24, (TMP), STRIDE

	// Finalize the first half of the keystream for each block.
	vadd.vx v0, v0, CONSTS0
	vadd.vx v1, v1, CONSTS1
	vadd.vx v2, v2, CONSTS2
	vadd.vx v3, v3, CONSTS3
	vadd.vx v4, v4, KEY0
	vadd.vx v5, v5, KEY1
	vadd.vx v6, v6, KEY2
	vadd.vx v7, v7, KEY3

	// Encrypt/decrypt the first half of the data for each block.
	vxor.vv v16, v16, v0
	vxor.vv v17, v17, v1
	vxor.vv v18, v18, v2
	vxor.vv v19, v19, v3
	vxor.vv v20, v20, v4
	vxor.vv v21, v21, v5
	vxor.vv v22, v22, v6
	vxor.vv v23, v23, v7

	// Store the first half of the output data for each block.
	vssseg8e32.v v16, (OUTP), STRIDE

	// Finalize the second half of the keystream for each block.
	vadd.vx v8, v8, KEY4
	vadd.vx v9, v9, KEY5
	vadd.vx v10, v10, KEY6
	vadd.vx v11, v11, KEY7
	vid.v v0
	vadd.vx v12, v12, COUNTER
	vadd.vx v13, v13, NONCE0
	vadd.vx v14, v14, NONCE1
	vadd.vx v15, v15, NONCE2
	vadd.vv v12, v12, v0
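	// Word 12 of each block's initial state was COUNTER + lane index, so
	// v12 is finalized by adding COUNTER plus the lane index, regenerated
	// in v0 with vid.v now that v0's keystream value has already been used.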

	// Encrypt/decrypt the second half of the data for each block.
	vxor.vv v24, v24, v8
	vxor.vv v25, v25, v9
	vxor.vv v26, v26, v10
	vxor.vv v27, v27, v11
	vxor.vv v28, v28, v12
	vxor.vv v29, v29, v13
	vxor.vv v30, v30, v14
	vxor.vv v31, v31, v15

	// Store the second half of the output data for each block.
	addi TMP, OUTP, 32
	vssseg8e32.v v24, (TMP), STRIDE

	// Update the counter, the remaining number of blocks, and the input and
	// output pointers according to the number of blocks processed (VL).
	add COUNTER, COUNTER, VL
	sub NBLOCKS, NBLOCKS, VL
	slli TMP, VL, 6
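	// TMP = VL * 64: the number of data bytes processed in this iteration.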
	add OUTP, OUTP, TMP
	add INP, INP, TMP
	bnez NBLOCKS, .Lblock_loop

	sw COUNTER, 48(STATEP)
	ld s0, 0(sp)
	ld s1, 8(sp)
	ld s2, 16(sp)
	ld s3, 24(sp)
	ld s4, 32(sp)
	ld s5, 40(sp)
	ld s6, 48(sp)
	ld s7, 56(sp)
	ld s8, 64(sp)
	ld s9, 72(sp)
	ld s10, 80(sp)
	ld s11, 88(sp)
	addi sp, sp, 96
	ret
SYM_FUNC_END(chacha_zvkb)