GitHub Repository: torvalds/linux
Path: arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP a0
#define INP a1
#define OUTP a2
#define LEN a3
#define TWEAKP a4

#define LEN32 a5
#define TAIL_LEN a6
#define VL a7
#define VLMAX t4

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS v16		// LMUL=4 (most of the time)
#define TWEAKS_BREV v20		// LMUL=4 (most of the time)
#define MULTS_BREV v24		// LMUL=4 (most of the time)
#define TMP0 v28
#define TMP1 v29
#define TMP2 v30
#define TMP3 v31
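// (With LMUL=4, each of TWEAKS, TWEAKS_BREV, and MULTS_BREV names a group of
// four vector registers: v16-v19, v20-v23, and v24-v27 respectively.)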

// xts_init initializes the following values:
//
// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
// TWEAKS_BREV: same as TWEAKS, but bit-reversed
// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
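//
// For reference, the serial shift+xor method mentioned above advances a single
// tweak one step at a time, operating on the tweak in its normal
// (non-bit-reversed) byte order. A rough C sketch of it, with a made-up helper
// name, not used by this file:
//
//        static void xts_mult_x(unsigned char t[16])
//        {
//                unsigned char carry = 0;
//
//                for (int i = 0; i < 16; i++) {
//                        unsigned char next = t[i] >> 7;
//
//                        t[i] = (unsigned char)(t[i] << 1) | carry;
//                        carry = next;
//                }
//                if (carry)
//                        t[0] ^= 0x87;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
//        }
//
// vgmul instead performs full multiplications in this field, so all N tweaks
// in a register group can be advanced in parallel.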
.macro xts_init

        // Load the first tweak T.
        vsetivli zero, 4, e32, m1, ta, ma
        vle32.v TWEAKS, (TWEAKP)

        // If there's only one block (or no blocks at all), then skip the tweak
        // sequence computation because (at most) T itself is needed.
        li t0, 16
        ble LEN, t0, .Linit_single_block\@

        // Save a copy of T bit-reversed in v12.
        vbrev8.v v12, TWEAKS

        //
        // Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
        // that N <= 128. Though, this code actually requires N < 64 (or
        // equivalently VLEN < 2048) due to the use of 64-bit intermediate
        // values here and in the x^N computation later.
        //
        vsetvli VL, LEN32, e32, m4, ta, ma
        srli t0, VL, 2		// t0 = N (num blocks)
        // Generate two sequences, each with N 32-bit values:
        // v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
        vsetvli zero, t0, e32, m1, ta, ma
        vmv.v.i v0, 1
        vid.v v1
        // Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
        // as two sequences, each with 2*N 32-bit values:
        // v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
        vsetvli zero, t0, e64, m2, ta, ma
        vzext.vf2 v2, v0
        vzext.vf2 v4, v1
        slli t1, t0, 1		// t1 = 2*N
        vsetvli zero, t1, e32, m2, ta, ma
        // Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
        // widening to 64 bits per element. When reinterpreted as N 128-bit
        // values, this is the needed sequence of 128-bit values 1 << i (x^i).
        vwsll.vv v8, v2, v4
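        // (For example, with N = 4 (the VLEN = 128 case) the 64-bit elements
        // of v8 are [1, 0, 2, 0, 4, 0, 8, 0], i.e. the 128-bit values
        // [1, 2, 4, 8] = [x^0, x^1, x^2, x^3].)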

        // Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
        // multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
        vsetvli zero, LEN32, e32, m4, ta, ma
        vmv.v.i TWEAKS_BREV, 0
        vaesz.vs TWEAKS_BREV, v12
        vbrev8.v v8, v8
        vgmul.vv TWEAKS_BREV, v8

        // Save a copy of the sequence T*(x^i) with the bit reversal undone.
        vbrev8.v TWEAKS, TWEAKS_BREV

        // Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
        li t1, 1
        sll t1, t1, t0		// t1 = 1 << N
        vsetivli zero, 2, e64, m1, ta, ma
        vmv.v.i v0, 0
        vsetivli zero, 1, e64, m1, tu, ma
        vmv.v.x v0, t1
        vbrev8.v v0, v0
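        // (For example, with N = 4: t1 = 0x10, and after vbrev8 the low byte
        // of v0 is 0x08.)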
        vsetvli zero, LEN32, e32, m4, ta, ma
        vmv.v.i MULTS_BREV, 0
        vaesz.vs MULTS_BREV, v0

        j .Linit_done\@

.Linit_single_block\@:
        vbrev8.v TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one.
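// (The polynomial x corresponds to the byte 0x02 = 0b00000010; reversing the
// bits within that byte gives 0b01000000 = 0x40.)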
.macro load_x
        li t0, 0x40
        vsetivli zero, 4, e32, m1, ta, ma
        vmv.v.i MULTS_BREV, 0
        vsetivli zero, 1, e8, m1, tu, ma
        vmv.v.x MULTS_BREV, t0
.endm

.macro __aes_xts_crypt enc, keylen
        // With 16 < len <= 31, there's no main loop, just ciphertext stealing.
        beqz LEN32, .Lcts_without_main_loop\@

        vsetvli VLMAX, zero, e32, m4, ta, ma
1:
        vsetvli VL, LEN32, e32, m4, ta, ma
2:
        // Encrypt or decrypt VL/4 blocks.
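        // (Each block is transformed as out = AES_enc/dec(in ^ tweak) ^ tweak,
        // the standard XTS per-block operation.)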
        vle32.v TMP0, (INP)
        vxor.vv TMP0, TMP0, TWEAKS
        aes_crypt TMP0, \enc, \keylen
        vxor.vv TMP0, TMP0, TWEAKS
        vse32.v TMP0, (OUTP)

        // Update the pointers and the remaining length.
        slli t0, VL, 2
        add INP, INP, t0
        add OUTP, OUTP, t0
        sub LEN32, LEN32, VL

        // Check whether more blocks remain.
        beqz LEN32, .Lmain_loop_done\@

        // Compute the next sequence of tweaks by multiplying the previous
        // sequence by x^N. Store the result in both bit-reversed order and
        // regular order (i.e. with the bit reversal undone).
        vgmul.vv TWEAKS_BREV, MULTS_BREV
        vbrev8.v TWEAKS, TWEAKS_BREV

        // Since we compute the tweak multipliers x^N in advance, we require
        // that each iteration process the same length except possibly the last.
        // This conflicts slightly with the behavior allowed by RISC-V Vector
        // Extension, where CPUs can select a lower length for both of the last
        // two iterations. E.g., vl might take the sequence of values
        // [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
        // can use x^4 again instead of computing x^3. Therefore, we explicitly
        // keep the vl at VLMAX if there is at least VLMAX remaining.
        bge LEN32, VLMAX, 2b
        j 1b

.Lmain_loop_done\@:
        load_x

        // Compute the next tweak.
        addi t0, VL, -4
        vsetivli zero, 4, e32, m4, ta, ma
        vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
        vsetivli zero, 4, e32, m1, ta, ma
        vgmul.vv TWEAKS_BREV, MULTS_BREV		// Advance to next tweak

        bnez TAIL_LEN, .Lcts\@

        // Update *TWEAKP to contain the next tweak.
        vbrev8.v TWEAKS, TWEAKS_BREV
        vse32.v TWEAKS, (TWEAKP)
        ret

.Lcts_without_main_loop\@:
        load_x
.Lcts\@:
        // TWEAKS_BREV now contains the next tweak. Compute the one after that.
        vsetivli zero, 4, e32, m1, ta, ma
        vmv.v.v TMP0, TWEAKS_BREV
        vgmul.vv TMP0, MULTS_BREV
        // Undo the bit reversal of the next two tweaks and store them in TMP1
        // and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
        vbrev8.v TMP1, TWEAKS_BREV
        vbrev8.v TMP2, TMP0
.else
        vbrev8.v TMP1, TMP0
        vbrev8.v TMP2, TWEAKS_BREV
.endif
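        // (Decryption consumes the two tweaks in the opposite order because,
        // during encryption, the last full ciphertext block is the one produced
        // by the second AES invocation, i.e. under the later tweak.)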

        // Encrypt/decrypt the last full block.
        vle32.v TMP0, (INP)
        vxor.vv TMP0, TMP0, TMP1
        aes_crypt TMP0, \enc, \keylen
        vxor.vv TMP0, TMP0, TMP1

        // Swap the first TAIL_LEN bytes of the above result with the tail.
        // Note that to support in-place encryption/decryption, the load from
        // the input tail must happen before the store to the output tail.
        addi t0, INP, 16
        addi t1, OUTP, 16
        vmv.v.v TMP3, TMP0
        vsetvli zero, TAIL_LEN, e8, m1, tu, ma
        vle8.v TMP0, (t0)
        vse8.v TMP3, (t1)
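        // (TMP0 now holds the TAIL_LEN input tail bytes followed by the
        // remaining bytes of the block processed above, and the output tail at
        // OUTP + 16 holds the first TAIL_LEN bytes of that block.)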

        // Encrypt/decrypt again and store the last full block.
        vsetivli zero, 4, e32, m1, ta, ma
        vxor.vv TMP0, TMP0, TMP2
        aes_crypt TMP0, \enc, \keylen
        vxor.vv TMP0, TMP0, TMP2
        vse32.v TMP0, (OUTP)

        ret
.endm

.macro aes_xts_crypt enc

        // Check whether the length is a multiple of the AES block size.
        andi TAIL_LEN, LEN, 15
        beqz TAIL_LEN, 1f

        // The length isn't a multiple of the AES block size, so ciphertext
        // stealing will be required. Ciphertext stealing involves special
        // handling of the partial block and the last full block, so subtract
        // the length of both from the length to be processed in the main loop.
        sub LEN, LEN, TAIL_LEN
        addi LEN, LEN, -16
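        // (For example, LEN = 40 gives TAIL_LEN = 8 and LEN = 16 here, so the
        // main loop handles one block and ciphertext stealing handles the
        // final 16 + 8 bytes.)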
1:
        srli LEN32, LEN, 2
        // LEN and LEN32 now contain the total length of the blocks that will be
        // processed in the main loop, in bytes and 32-bit words respectively.

        xts_init
        aes_begin KEYP, 128f, 192f
        __aes_xts_crypt \enc, 256
128:
        __aes_xts_crypt \enc, 128
192:
        __aes_xts_crypt \enc, 192
.endm

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//                                       const u8 *in, u8 *out, size_t len,
//                                       u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
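//
// For example, a caller encrypting a 4096 + 23 byte message incrementally
// might make two calls (an illustrative sketch; the buffer names are
// hypothetical):
//
//        aes_xts_encrypt_zvkned_zvbb_zvkg(key, in, out, 4096, tweak);
//        aes_xts_encrypt_zvkned_zvbb_zvkg(key, in + 4096, out + 4096, 23, tweak);
//
// The first call's length is a multiple of 16, so it updates |tweak|; the
// second, final call may use a non-multiple-of-16 length as long as it is
// still >= 16.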
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
        aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
        aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)