GitHub Repository: torvalds/linux
Path: blob/master/arch/arm64/crypto/aes-neonbs-core.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <[email protected]>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <[email protected]>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text

	rounds	.req	x11
	bskey	.req	x12

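/*
 * Note on the data layout used by the macros below: they operate on a
 * bit-sliced state held in v0-v7, where (after the bitslice transform further
 * down) each of the eight registers collects a single bit position of every
 * byte of up to eight 16-byte AES blocks. 'rounds' and 'bskey' carry the
 * round count and the pointer to the bit-sliced key schedule into
 * aesbs_encrypt8/aesbs_decrypt8.
 */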
	.macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor \b2, \b2, \b1
	eor \b5, \b5, \b6
	eor \b3, \b3, \b0
	eor \b6, \b6, \b2
	eor \b5, \b5, \b0
	eor \b6, \b6, \b3
	eor \b3, \b3, \b7
	eor \b7, \b7, \b5
	eor \b3, \b3, \b4
	eor \b4, \b4, \b5
	eor \b2, \b2, \b7
	eor \b3, \b3, \b1
	eor \b1, \b1, \b5
	.endm

	.macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor \b0, \b0, \b6
	eor \b1, \b1, \b4
	eor \b4, \b4, \b6
	eor \b2, \b2, \b0
	eor \b6, \b6, \b1
	eor \b1, \b1, \b5
	eor \b5, \b5, \b3
	eor \b3, \b3, \b7
	eor \b7, \b7, \b5
	eor \b2, \b2, \b5
	eor \b4, \b4, \b7
	.endm

	.macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor \b1, \b1, \b7
	eor \b4, \b4, \b7
	eor \b7, \b7, \b5
	eor \b1, \b1, \b3
	eor \b2, \b2, \b5
	eor \b3, \b3, \b7
	eor \b6, \b6, \b1
	eor \b2, \b2, \b0
	eor \b5, \b5, \b3
	eor \b4, \b4, \b6
	eor \b0, \b0, \b6
	eor \b1, \b1, \b4
	.endm

	.macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor \b1, \b1, \b5
	eor \b2, \b2, \b7
	eor \b3, \b3, \b1
	eor \b4, \b4, \b5
	eor \b7, \b7, \b5
	eor \b3, \b3, \b4
	eor \b5, \b5, \b0
	eor \b3, \b3, \b7
	eor \b6, \b6, \b2
	eor \b2, \b2, \b1
	eor \b6, \b6, \b3
	eor \b3, \b3, \b0
	eor \b5, \b5, \b6
	.endm

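/*
 * Bit-sliced arithmetic in the small fields of the tower: mul_gf4 multiplies
 * two GF(2^2) elements (two bit planes per operand), mul_gf4_n_gf4 computes
 * two such products with shared temporaries, and mul_gf16_2 combines them
 * into a pair of GF(2^4) multiplications. These feed the GF(2^8) inversion
 * in inv_gf256 below, following the construction from the paper cited at the
 * top of this file.
 */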
	.macro mul_gf4, x0, x1, y0, y1, t0, t1
	eor \t0, \y0, \y1
	and \t0, \t0, \x0
	eor \x0, \x0, \x1
	and \t1, \x1, \y0
	and \x0, \x0, \y1
	eor \x1, \t1, \t0
	eor \x0, \x0, \t1
	.endm

	.macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor \t0, \y0, \y1
	eor \t1, \y2, \y3
	and \t0, \t0, \x0
	and \t1, \t1, \x2
	eor \x0, \x0, \x1
	eor \x2, \x2, \x3
	and \x1, \x1, \y0
	and \x3, \x3, \y2
	and \x0, \x0, \y1
	and \x2, \x2, \y3
	eor \x1, \x1, \x0
	eor \x2, \x2, \x3
	eor \x0, \x0, \t0
	eor \x3, \x3, \t1
	.endm

	.macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	eor \t0, \x0, \x2
	eor \t1, \x1, \x3
	mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
	eor \y0, \y0, \y2
	eor \y1, \y1, \y3
	mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor \x0, \x0, \t0
	eor \x2, \x2, \t0
	eor \x1, \x1, \t1
	eor \x3, \x3, \t1
	eor \t0, \x4, \x6
	eor \t1, \x5, \x7
	mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor \y0, \y0, \y2
	eor \y1, \y1, \y3
	mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
	eor \x4, \x4, \t0
	eor \x6, \x6, \t0
	eor \x5, \x5, \t1
	eor \x7, \x7, \t1
	.endm

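/*
 * Bit-sliced inversion in GF(2^8), evaluated via the tower-field
 * decomposition described in the Kaesper/Schwabe paper referenced above.
 * Together with the in_bs_ch/out_bs_ch basis changes this yields the AES
 * S-box (and its inverse) on eight blocks at once.
 */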
	.macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	eor \t3, \x4, \x6
	eor \t0, \x5, \x7
	eor \t1, \x1, \x3
	eor \s1, \x7, \x6
	eor \s0, \x0, \x2
	eor \s3, \t3, \t0
	orr \t2, \t0, \t1
	and \s2, \t3, \s0
	orr \t3, \t3, \s0
	eor \s0, \s0, \t1
	and \t0, \t0, \t1
	eor \t1, \x3, \x2
	and \s3, \s3, \s0
	and \s1, \s1, \t1
	eor \t1, \x4, \x5
	eor \s0, \x1, \x0
	eor \t3, \t3, \s1
	eor \t2, \t2, \s1
	and \s1, \t1, \s0
	orr \t1, \t1, \s0
	eor \t3, \t3, \s3
	eor \t0, \t0, \s1
	eor \t2, \t2, \s2
	eor \t1, \t1, \s3
	eor \t0, \t0, \s2
	and \s0, \x7, \x3
	eor \t1, \t1, \s2
	and \s1, \x6, \x2
	and \s2, \x5, \x1
	orr \s3, \x4, \x0
	eor \t3, \t3, \s0
	eor \t1, \t1, \s2
	eor \s0, \t0, \s3
	eor \t2, \t2, \s1
	and \s2, \t3, \t1
	eor \s1, \t2, \s2
	eor \s3, \s0, \s2
	bsl \s1, \t1, \s0
	not \t0, \s0
	bsl \s0, \s1, \s3
	bsl \t0, \s1, \s3
	bsl \s3, \t3, \t2
	eor \t3, \t3, \t2
	and \s2, \s0, \s3
	eor \t1, \t1, \t0
	eor \s2, \s2, \t3
	mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	.macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

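/*
 * Load the next bit-sliced round key into v16-v23: every round key in the
 * schedule produced by aesbs_convert_key occupies eight 16-byte vectors
 * (128 bytes). The encryption path walks the schedule forwards, the
 * decryption path walks it backwards.
 */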
	.macro enc_next_rk
	ldp q16, q17, [bskey], #128
	ldp q18, q19, [bskey, #-96]
	ldp q20, q21, [bskey, #-64]
	ldp q22, q23, [bskey, #-32]
	.endm

	.macro dec_next_rk
	ldp q16, q17, [bskey, #-128]!
	ldp q18, q19, [bskey, #32]
	ldp q20, q21, [bskey, #64]
	ldp q22, q23, [bskey, #96]
	.endm

	.macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor \x0\().16b, \x0\().16b, v16.16b
	eor \x1\().16b, \x1\().16b, v17.16b
	eor \x2\().16b, \x2\().16b, v18.16b
	eor \x3\().16b, \x3\().16b, v19.16b
	eor \x4\().16b, \x4\().16b, v20.16b
	eor \x5\().16b, \x5\().16b, v21.16b
	eor \x6\().16b, \x6\().16b, v22.16b
	eor \x7\().16b, \x7\().16b, v23.16b
	.endm

	.macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl \x0\().16b, {\x0\().16b}, \mask\().16b
	tbl \x1\().16b, {\x1\().16b}, \mask\().16b
	tbl \x2\().16b, {\x2\().16b}, \mask\().16b
	tbl \x3\().16b, {\x3\().16b}, \mask\().16b
	tbl \x4\().16b, {\x4\().16b}, \mask\().16b
	tbl \x5\().16b, {\x5\().16b}, \mask\().16b
	tbl \x6\().16b, {\x6\().16b}, \mask\().16b
	tbl \x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

	.macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext \t0\().16b, \x0\().16b, \x0\().16b, #12
	ext \t1\().16b, \x1\().16b, \x1\().16b, #12
	eor \x0\().16b, \x0\().16b, \t0\().16b
	ext \t2\().16b, \x2\().16b, \x2\().16b, #12
	eor \x1\().16b, \x1\().16b, \t1\().16b
	ext \t3\().16b, \x3\().16b, \x3\().16b, #12
	eor \x2\().16b, \x2\().16b, \t2\().16b
	ext \t4\().16b, \x4\().16b, \x4\().16b, #12
	eor \x3\().16b, \x3\().16b, \t3\().16b
	ext \t5\().16b, \x5\().16b, \x5\().16b, #12
	eor \x4\().16b, \x4\().16b, \t4\().16b
	ext \t6\().16b, \x6\().16b, \x6\().16b, #12
	eor \x5\().16b, \x5\().16b, \t5\().16b
	ext \t7\().16b, \x7\().16b, \x7\().16b, #12
	eor \x6\().16b, \x6\().16b, \t6\().16b
	eor \t1\().16b, \t1\().16b, \x0\().16b
	eor \x7\().16b, \x7\().16b, \t7\().16b
	ext \x0\().16b, \x0\().16b, \x0\().16b, #8
	eor \t2\().16b, \t2\().16b, \x1\().16b
	eor \t0\().16b, \t0\().16b, \x7\().16b
	eor \t1\().16b, \t1\().16b, \x7\().16b
	ext \x1\().16b, \x1\().16b, \x1\().16b, #8
	eor \t5\().16b, \t5\().16b, \x4\().16b
	eor \x0\().16b, \x0\().16b, \t0\().16b
	eor \t6\().16b, \t6\().16b, \x5\().16b
	eor \x1\().16b, \x1\().16b, \t1\().16b
	ext \t0\().16b, \x4\().16b, \x4\().16b, #8
	eor \t4\().16b, \t4\().16b, \x3\().16b
	ext \t1\().16b, \x5\().16b, \x5\().16b, #8
	eor \t7\().16b, \t7\().16b, \x6\().16b
	ext \x4\().16b, \x3\().16b, \x3\().16b, #8
	eor \t3\().16b, \t3\().16b, \x2\().16b
	ext \x5\().16b, \x7\().16b, \x7\().16b, #8
	eor \t4\().16b, \t4\().16b, \x7\().16b
	ext \x3\().16b, \x6\().16b, \x6\().16b, #8
	eor \t3\().16b, \t3\().16b, \x7\().16b
	ext \x6\().16b, \x2\().16b, \x2\().16b, #8
	eor \x7\().16b, \t1\().16b, \t5\().16b
	.ifb \inv
	eor \x2\().16b, \t0\().16b, \t4\().16b
	eor \x4\().16b, \x4\().16b, \t3\().16b
	eor \x5\().16b, \x5\().16b, \t7\().16b
	eor \x3\().16b, \x3\().16b, \t6\().16b
	eor \x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor \t3\().16b, \t3\().16b, \x4\().16b
	eor \x5\().16b, \x5\().16b, \t7\().16b
	eor \x2\().16b, \x3\().16b, \t6\().16b
	eor \x3\().16b, \t0\().16b, \t4\().16b
	eor \x4\().16b, \x6\().16b, \t2\().16b
	mov \x6\().16b, \t3\().16b
	.endif
	.endm

	.macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	ext \t0\().16b, \x0\().16b, \x0\().16b, #8
	ext \t6\().16b, \x6\().16b, \x6\().16b, #8
	ext \t7\().16b, \x7\().16b, \x7\().16b, #8
	eor \t0\().16b, \t0\().16b, \x0\().16b
	ext \t1\().16b, \x1\().16b, \x1\().16b, #8
	eor \t6\().16b, \t6\().16b, \x6\().16b
	ext \t2\().16b, \x2\().16b, \x2\().16b, #8
	eor \t7\().16b, \t7\().16b, \x7\().16b
	ext \t3\().16b, \x3\().16b, \x3\().16b, #8
	eor \t1\().16b, \t1\().16b, \x1\().16b
	ext \t4\().16b, \x4\().16b, \x4\().16b, #8
	eor \t2\().16b, \t2\().16b, \x2\().16b
	ext \t5\().16b, \x5\().16b, \x5\().16b, #8
	eor \t3\().16b, \t3\().16b, \x3\().16b
	eor \t4\().16b, \t4\().16b, \x4\().16b
	eor \t5\().16b, \t5\().16b, \x5\().16b
	eor \x0\().16b, \x0\().16b, \t6\().16b
	eor \x1\().16b, \x1\().16b, \t6\().16b
	eor \x2\().16b, \x2\().16b, \t0\().16b
	eor \x4\().16b, \x4\().16b, \t2\().16b
	eor \x3\().16b, \x3\().16b, \t1\().16b
	eor \x1\().16b, \x1\().16b, \t7\().16b
	eor \x2\().16b, \x2\().16b, \t7\().16b
	eor \x4\().16b, \x4\().16b, \t6\().16b
	eor \x5\().16b, \x5\().16b, \t3\().16b
	eor \x3\().16b, \x3\().16b, \t6\().16b
	eor \x6\().16b, \x6\().16b, \t4\().16b
	eor \x4\().16b, \x4\().16b, \t7\().16b
	eor \x5\().16b, \x5\().16b, \t7\().16b
	eor \x7\().16b, \x7\().16b, \t5\().16b
	mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

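/*
 * swapmove_2x exchanges bit groups between two pairs of registers at distance
 * \n under the given mask; three passes of it in the bitslice macro (masks
 * 0x55, 0x33 and 0x0f at shifts 1, 2 and 4) transpose the data so that each
 * of the eight registers ends up holding a single bit position of every state
 * byte. The same macro converts back before the results are written out.
 */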
	.macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr \t0\().2d, \b0\().2d, #\n
	ushr \t1\().2d, \b1\().2d, #\n
	eor \t0\().16b, \t0\().16b, \a0\().16b
	eor \t1\().16b, \t1\().16b, \a1\().16b
	and \t0\().16b, \t0\().16b, \mask\().16b
	and \t1\().16b, \t1\().16b, \mask\().16b
	eor \a0\().16b, \a0\().16b, \t0\().16b
	shl \t0\().2d, \t0\().2d, #\n
	eor \a1\().16b, \a1\().16b, \t1\().16b
	shl \t1\().2d, \t1\().2d, #\n
	eor \b0\().16b, \b0\().16b, \t0\().16b
	eor \b1\().16b, \b1\().16b, \t1\().16b
	.endm

	.macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi \t0\().16b, #0x55
	movi \t1\().16b, #0x33
	swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi \t0\().16b, #0x0f
	swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm


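/*
 * Permutation masks for the tbl instructions: M0 reorders bytes into the
 * layout expected by the bit-sliced code (it is also applied to each round
 * key in aesbs_convert_key); the M0SR/SR/SRM0 and M0ISR/ISR/ISRM0 families
 * additionally fold in the ShiftRows resp. InvShiftRows permutation used
 * during encryption and decryption.
 */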
	.align 6
M0:	.octa 0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa 0x0004080c05090d010a0e02060f03070b
SR:	.octa 0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa 0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa 0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa 0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa 0x0306090c00070a0d01040b0e0205080f

/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
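/*
 * A minimal caller sketch, for illustration only (the in-tree users live in
 * aes-neonbs-glue.c, and prepare_bskey below is a hypothetical name): the key
 * is first expanded with the generic aes_expandkey() helper from <crypto/aes.h>
 * and then repacked into the bit-sliced schedule consumed by this file. The
 * output buffer must hold 13 * 128 + 32 bytes in the AES-256 worst case
 * (round 0 key + 13 bit-sliced round keys + final round key).
 *
 *	#include <crypto/aes.h>
 *	#include <asm/neon.h>
 *
 *	asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
 *
 *	static int prepare_bskey(u8 *bskey, const u8 *in_key, unsigned int key_len)
 *	{
 *		struct crypto_aes_ctx rk;
 *		int rounds = 6 + key_len / 4;	// 10/12/14 for AES-128/192/256
 *		int err = aes_expandkey(&rk, in_key, key_len);
 *
 *		if (err)
 *			return err;
 *		kernel_neon_begin();
 *		aesbs_convert_key(bskey, rk.key_enc, rounds);
 *		kernel_neon_end();
 *		return 0;
 *	}
 */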
SYM_FUNC_START(aesbs_convert_key)
	ld1 {v7.4s}, [x1], #16		// load round 0 key
	ld1 {v17.4s}, [x1], #16		// load round 1 key

	movi v8.16b, #0x01		// bit masks
	movi v9.16b, #0x02
	movi v10.16b, #0x04
	movi v11.16b, #0x08
	movi v12.16b, #0x10
	movi v13.16b, #0x20
	movi v14.16b, #0x40
	movi v15.16b, #0x80
	ldr q16, M0

	sub x2, x2, #1
	str q7, [x0], #16		// save round 0 key

.Lkey_loop:
	tbl v7.16b, {v17.16b}, v16.16b
	ld1 {v17.4s}, [x1], #16		// load next round key

	cmtst v0.16b, v7.16b, v8.16b
	cmtst v1.16b, v7.16b, v9.16b
	cmtst v2.16b, v7.16b, v10.16b
	cmtst v3.16b, v7.16b, v11.16b
	cmtst v4.16b, v7.16b, v12.16b
	cmtst v5.16b, v7.16b, v13.16b
	cmtst v6.16b, v7.16b, v14.16b
	cmtst v7.16b, v7.16b, v15.16b
	not v0.16b, v0.16b
	not v1.16b, v1.16b
	not v5.16b, v5.16b
	not v6.16b, v6.16b

	subs x2, x2, #1
	stp q0, q1, [x0], #128
	stp q2, q3, [x0, #-96]
	stp q4, q5, [x0, #-64]
	stp q6, q7, [x0, #-32]
	b.ne .Lkey_loop

	movi v7.16b, #0x63		// compose .L63
	eor v17.16b, v17.16b, v7.16b
	str q17, [x0]
	ret
SYM_FUNC_END(aesbs_convert_key)

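/*
 * aesbs_encrypt8/aesbs_decrypt8: process eight blocks held in v0-v7, using
 * the round count in 'rounds' (x11) and the bit-sliced key schedule pointed
 * to by 'bskey' (x12). The results come back in a permuted register order;
 * see the output operands passed to __ecb_crypt further down.
 */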
	.align 4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr q9, [bskey], #16		// round 0 key
	ldr q8, M0SR
	ldr q24, SR

	eor v10.16b, v0.16b, v9.16b	// xor with round0 key
	eor v11.16b, v1.16b, v9.16b
	tbl v0.16b, {v10.16b}, v8.16b
	eor v12.16b, v2.16b, v9.16b
	tbl v1.16b, {v11.16b}, v8.16b
	eor v13.16b, v3.16b, v9.16b
	tbl v2.16b, {v12.16b}, v8.16b
	eor v14.16b, v4.16b, v9.16b
	tbl v3.16b, {v13.16b}, v8.16b
	eor v15.16b, v5.16b, v9.16b
	tbl v4.16b, {v14.16b}, v8.16b
	eor v10.16b, v6.16b, v9.16b
	tbl v5.16b, {v15.16b}, v8.16b
	eor v11.16b, v7.16b, v9.16b
	tbl v6.16b, {v10.16b}, v8.16b
	tbl v7.16b, {v11.16b}, v8.16b

	bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub rounds, rounds, #1
	b .Lenc_sbox

.Lenc_loop:
	shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs rounds, rounds, #1
	b.cc .Lenc_done

	enc_next_rk

	mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	add_round_key v0, v1, v2, v3, v4, v5, v6, v7

	b.ne .Lenc_loop
	ldr q24, SRM0
	b .Lenc_loop

.Lenc_done:
	ldr q12, [bskey]		// last round key

	bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor v0.16b, v0.16b, v12.16b
	eor v1.16b, v1.16b, v12.16b
	eor v4.16b, v4.16b, v12.16b
	eor v6.16b, v6.16b, v12.16b
	eor v3.16b, v3.16b, v12.16b
	eor v7.16b, v7.16b, v12.16b
	eor v2.16b, v2.16b, v12.16b
	eor v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)

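/*
 * Decryption indexes the key schedule from the end: bskey is first advanced
 * by rounds * 128 bytes and dec_next_rk then steps backwards through the
 * bit-sliced round keys.
 */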
	.align 4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	lsl x9, rounds, #7
	add bskey, bskey, x9

	ldr q9, [bskey, #-112]!		// round 0 key
	ldr q8, M0ISR
	ldr q24, ISR

	eor v10.16b, v0.16b, v9.16b	// xor with round0 key
	eor v11.16b, v1.16b, v9.16b
	tbl v0.16b, {v10.16b}, v8.16b
	eor v12.16b, v2.16b, v9.16b
	tbl v1.16b, {v11.16b}, v8.16b
	eor v13.16b, v3.16b, v9.16b
	tbl v2.16b, {v12.16b}, v8.16b
	eor v14.16b, v4.16b, v9.16b
	tbl v3.16b, {v13.16b}, v8.16b
	eor v15.16b, v5.16b, v9.16b
	tbl v4.16b, {v14.16b}, v8.16b
	eor v10.16b, v6.16b, v9.16b
	tbl v5.16b, {v15.16b}, v8.16b
	eor v11.16b, v7.16b, v9.16b
	tbl v6.16b, {v10.16b}, v8.16b
	tbl v7.16b, {v11.16b}, v8.16b

	bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub rounds, rounds, #1
	b .Ldec_sbox

.Ldec_loop:
	shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs rounds, rounds, #1
	b.cc .Ldec_done

	dec_next_rk

	add_round_key v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	b.ne .Ldec_loop
	ldr q24, ISRM0
	b .Ldec_loop
.Ldec_done:
	ldr q12, [bskey, #-16]		// last round key

	bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor v0.16b, v0.16b, v12.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v12.16b
	eor v4.16b, v4.16b, v12.16b
	eor v2.16b, v2.16b, v12.16b
	eor v7.16b, v7.16b, v12.16b
	eor v3.16b, v3.16b, v12.16b
	eor v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)

/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
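/*
 * The ECB helpers work on batches of up to eight blocks. When fewer than
 * eight remain, x5 is set to 1 << blocks so that the tbnz instructions below
 * skip the unused loads and stores; otherwise x5 is zero and a full batch is
 * processed.
 */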
	.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push 5

	mov x19, x0
	mov x20, x1
	mov x21, x2
	mov x22, x3
	mov x23, x4

99:	mov x5, #1
	lsl x5, x5, x23
	subs w23, w23, #8
	csel x23, x23, xzr, pl
	csel x5, x5, xzr, mi

	ld1 {v0.16b}, [x20], #16
	tbnz x5, #1, 0f
	ld1 {v1.16b}, [x20], #16
	tbnz x5, #2, 0f
	ld1 {v2.16b}, [x20], #16
	tbnz x5, #3, 0f
	ld1 {v3.16b}, [x20], #16
	tbnz x5, #4, 0f
	ld1 {v4.16b}, [x20], #16
	tbnz x5, #5, 0f
	ld1 {v5.16b}, [x20], #16
	tbnz x5, #6, 0f
	ld1 {v6.16b}, [x20], #16
	tbnz x5, #7, 0f
	ld1 {v7.16b}, [x20], #16

0:	mov bskey, x21
	mov rounds, x22
	bl \do8

	st1 {\o0\().16b}, [x19], #16
	tbnz x5, #1, 1f
	st1 {\o1\().16b}, [x19], #16
	tbnz x5, #2, 1f
	st1 {\o2\().16b}, [x19], #16
	tbnz x5, #3, 1f
	st1 {\o3\().16b}, [x19], #16
	tbnz x5, #4, 1f
	st1 {\o4\().16b}, [x19], #16
	tbnz x5, #5, 1f
	st1 {\o5\().16b}, [x19], #16
	tbnz x5, #6, 1f
	st1 {\o6\().16b}, [x19], #16
	tbnz x5, #7, 1f
	st1 {\o7\().16b}, [x19], #16

	cbz x23, 1f
	b 99b

1:	frame_pop
	ret
	.endm

	.align 4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

	.align 4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
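/*
 * CBC decryption: copies of the first seven ciphertext blocks are kept in
 * v25-v31 before calling aesbs_decrypt8 (the eighth is re-read from memory
 * when needed), each decrypted block is XORed with the previous ciphertext
 * (or with the IV for the first block), and the last ciphertext processed is
 * written back to iv[] for the next call.
 */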
	.align 4
SYM_FUNC_START(aesbs_cbc_decrypt)
	frame_push 6

	mov x19, x0
	mov x20, x1
	mov x21, x2
	mov x22, x3
	mov x23, x4
	mov x24, x5

99:	mov x6, #1
	lsl x6, x6, x23
	subs w23, w23, #8
	csel x23, x23, xzr, pl
	csel x6, x6, xzr, mi

	ld1 {v0.16b}, [x20], #16
	mov v25.16b, v0.16b
	tbnz x6, #1, 0f
	ld1 {v1.16b}, [x20], #16
	mov v26.16b, v1.16b
	tbnz x6, #2, 0f
	ld1 {v2.16b}, [x20], #16
	mov v27.16b, v2.16b
	tbnz x6, #3, 0f
	ld1 {v3.16b}, [x20], #16
	mov v28.16b, v3.16b
	tbnz x6, #4, 0f
	ld1 {v4.16b}, [x20], #16
	mov v29.16b, v4.16b
	tbnz x6, #5, 0f
	ld1 {v5.16b}, [x20], #16
	mov v30.16b, v5.16b
	tbnz x6, #6, 0f
	ld1 {v6.16b}, [x20], #16
	mov v31.16b, v6.16b
	tbnz x6, #7, 0f
	ld1 {v7.16b}, [x20]

0:	mov bskey, x21
	mov rounds, x22
	bl aesbs_decrypt8

	ld1 {v24.16b}, [x24]		// load IV

	eor v1.16b, v1.16b, v25.16b
	eor v6.16b, v6.16b, v26.16b
	eor v4.16b, v4.16b, v27.16b
	eor v2.16b, v2.16b, v28.16b
	eor v7.16b, v7.16b, v29.16b
	eor v0.16b, v0.16b, v24.16b
	eor v3.16b, v3.16b, v30.16b
	eor v5.16b, v5.16b, v31.16b

	st1 {v0.16b}, [x19], #16
	mov v24.16b, v25.16b
	tbnz x6, #1, 1f
	st1 {v1.16b}, [x19], #16
	mov v24.16b, v26.16b
	tbnz x6, #2, 1f
	st1 {v6.16b}, [x19], #16
	mov v24.16b, v27.16b
	tbnz x6, #3, 1f
	st1 {v4.16b}, [x19], #16
	mov v24.16b, v28.16b
	tbnz x6, #4, 1f
	st1 {v2.16b}, [x19], #16
	mov v24.16b, v29.16b
	tbnz x6, #5, 1f
	st1 {v7.16b}, [x19], #16
	mov v24.16b, v30.16b
	tbnz x6, #6, 1f
	st1 {v3.16b}, [x19], #16
	mov v24.16b, v31.16b
	tbnz x6, #7, 1f
	ld1 {v24.16b}, [x20], #16
	st1 {v5.16b}, [x19], #16
1:	st1 {v24.16b}, [x24]		// store IV

	cbz x23, 2f
	b 99b

2:	frame_pop
	ret
SYM_FUNC_END(aesbs_cbc_decrypt)

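/*
 * Multiply the 128-bit XTS tweak by x in GF(2^128): the tweak is doubled and,
 * when the top bit was set, the 0x87 feedback term of the reducing polynomial
 * (prepared in \const by __xts_crypt8 below) is folded back in.
 */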
	.macro next_tweak, out, in, const, tmp
	sshr \tmp\().2d, \in\().2d, #63
	and \tmp\().16b, \tmp\().16b, \const\().16b
	add \out\().2d, \in\().2d, \in\().2d
	ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor \out\().16b, \out\().16b, \tmp\().16b
	.endm

/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
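/*
 * __xts_crypt8 loads eight blocks, derives the remaining tweaks for the batch
 * from the running tweak in v25, XORs them into the data, and saves tweaks
 * 8 and 9 at [x6] (v16/v17 are about to be clobbered by the round keys)
 * before branching to the encrypt8/decrypt8 routine whose address the caller
 * placed in x16. The __xts_crypt macro below XORs the tweaks back into the
 * outputs and loops over the input in batches of eight blocks.
 */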
SYM_FUNC_START_LOCAL(__xts_crypt8)
	movi v18.2s, #0x1
	movi v19.2s, #0x87
	uzp1 v18.4s, v18.4s, v19.4s

	ld1 {v0.16b-v3.16b}, [x1], #64
	ld1 {v4.16b-v7.16b}, [x1], #64

	next_tweak v26, v25, v18, v19
	next_tweak v27, v26, v18, v19
	next_tweak v28, v27, v18, v19
	next_tweak v29, v28, v18, v19
	next_tweak v30, v29, v18, v19
	next_tweak v31, v30, v18, v19
	next_tweak v16, v31, v18, v19
	next_tweak v17, v16, v18, v19

	eor v0.16b, v0.16b, v25.16b
	eor v1.16b, v1.16b, v26.16b
	eor v2.16b, v2.16b, v27.16b
	eor v3.16b, v3.16b, v28.16b
	eor v4.16b, v4.16b, v29.16b
	eor v5.16b, v5.16b, v30.16b
	eor v6.16b, v6.16b, v31.16b
	eor v7.16b, v7.16b, v16.16b

	stp q16, q17, [x6]

	mov bskey, x2
	mov rounds, x3
	br x16
SYM_FUNC_END(__xts_crypt8)

	.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push 0, 32
	add x6, sp, #.Lframe_local_offset

	ld1 {v25.16b}, [x5]

0:	adr x16, \do8
	bl __xts_crypt8

	eor v16.16b, \o0\().16b, v25.16b
	eor v17.16b, \o1\().16b, v26.16b
	eor v18.16b, \o2\().16b, v27.16b
	eor v19.16b, \o3\().16b, v28.16b

	ldp q24, q25, [x6]

	eor v20.16b, \o4\().16b, v29.16b
	eor v21.16b, \o5\().16b, v30.16b
	eor v22.16b, \o6\().16b, v31.16b
	eor v23.16b, \o7\().16b, v24.16b

	st1 {v16.16b-v19.16b}, [x0], #64
	st1 {v20.16b-v23.16b}, [x0], #64

	subs x4, x4, #8
	b.gt 0b

	st1 {v25.16b}, [x5]
	frame_pop
	ret
	.endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

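/*
 * Write the current 128-bit big-endian counter (kept as native-endian halves
 * in x7:x8) into \v, then post-increment it, propagating the carry from the
 * low half in x8 into x7; rev64 restores big-endian byte order within each
 * half.
 */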
	.macro next_ctr, v
	mov \v\().d[1], x8
	adds x8, x8, #1
	mov \v\().d[0], x7
	adc x7, x7, xzr
	rev64 \v\().16b, \v\().16b
	.endm

/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[])
 */
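/*
 * CTR encryption consumes the input eight blocks at a time: v0 is the counter
 * block taken directly from iv[], v1-v7 are generated with next_ctr, and the
 * incremented counter is stored back to iv[] on exit so that a subsequent
 * call continues the keystream.
 */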
SYM_FUNC_START(aesbs_ctr_encrypt)
	frame_push 0
	ldp x7, x8, [x5]
	ld1 {v0.16b}, [x5]
CPU_LE(	rev x7, x7 )
CPU_LE(	rev x8, x8 )
	adds x8, x8, #1
	adc x7, x7, xzr

0:	next_ctr v1
	next_ctr v2
	next_ctr v3
	next_ctr v4
	next_ctr v5
	next_ctr v6
	next_ctr v7

	mov bskey, x2
	mov rounds, x3
	bl aesbs_encrypt8

	ld1 { v8.16b-v11.16b}, [x1], #64
	ld1 {v12.16b-v15.16b}, [x1], #64

	eor v8.16b, v0.16b, v8.16b
	eor v9.16b, v1.16b, v9.16b
	eor v10.16b, v4.16b, v10.16b
	eor v11.16b, v6.16b, v11.16b
	eor v12.16b, v3.16b, v12.16b
	eor v13.16b, v7.16b, v13.16b
	eor v14.16b, v2.16b, v14.16b
	eor v15.16b, v5.16b, v15.16b

	st1 { v8.16b-v11.16b}, [x0], #64
	st1 {v12.16b-v15.16b}, [x0], #64

	next_ctr v0
	subs x4, x4, #8
	b.gt 0b

	st1 {v0.16b}, [x5]
	frame_pop
	ret
SYM_FUNC_END(aesbs_ctr_encrypt)