/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)	SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)	SYM_FUNC_END(neon_ ## func)

	xtsmask		.req	v7
	cbciv		.req	v7
	vctr		.req	v4

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm

	/* special case for the neon-bs driver calling into this one for CTS */
	.macro		xts_cts_skip_tw, reg, lbl
	tbnz		\reg, #1, \lbl
	.endm
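
	/*
	 * Note: the register aliases and macros above, together with the
	 * encrypt/decrypt helpers defined further down, form the interface
	 * expected by the shared mode template aes-modes.S, which is
	 * #included at the end of this file. Each AES implementation
	 * supplies the same set of names so the mode driver code can be
	 * reused as-is.
	 */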

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
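
	/*
	 * How mul_by_x works: sshr #7 arithmetic-shifts each byte lane right
	 * by seven, replicating the top bit across the lane (0x00 or 0xff).
	 * ANDing that mask with the constant 0x1b (kept in \const, see the
	 * movi in the prepare macro) selects the reduction term of the AES
	 * polynomial x^8 + x^4 + x^3 + x + 1 exactly when shl #1 shifted a
	 * set bit out of the byte. Roughly equivalent C for one lane:
	 *
	 *	out = (u8)(in << 1) ^ ((in & 0x80) ? 0x1b : 0);
	 */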

	/* multiply by polynomial 'x^2' in GF(2^8) */
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
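
	/*
	 * How mul_by_x2 works: shifting left by two can carry two bits out
	 * of the byte, so the reduction term is no longer a simple mask.
	 * ushr #6 leaves those carry bits as a value in {0, 1, 2, 3}, and
	 * pmul (polynomial, i.e. carry-less, multiply) by 0x1b expands each
	 * carry bit into its own shifted copy of the reduction constant.
	 * One lane in roughly equivalent C:
	 *
	 *	out = (u8)(in << 2) ^ clmul8(in >> 6, 0x1b);
	 *
	 * where clmul8() stands for an 8-bit carry-less multiply.
	 */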

	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm
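
	/*
	 * The AES S-box is 256 bytes, which fits exactly in the sixteen
	 * 16-byte registers v16-v31; the four ld1 instructions pull in 64
	 * bytes each. v12 holds the GF(2^8) reduction constant 0x1b for the
	 * mul_by_x* helpers, v13 the ShiftRows permutation and v14 the
	 * rotate-by-8 permutation used by mix_columns. crypto_aes_sbox and
	 * crypto_aes_inv_sbox are the generic lookup tables exported by the
	 * kernel's AES library code.
	 */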

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
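
	/*
	 * tbl can only index 64 bytes of table data, so the 256-byte S-box
	 * is consulted in four 64-byte windows. tbl zeroes lanes whose index
	 * is out of range while tbx leaves them unchanged, so each lane is
	 * written exactly once, by the window its index falls into. v15
	 * holds 0x40 at this point (set in the round loop below), and the
	 * repeated subtractions rebase the indices into the next window: an
	 * index of e.g. 0x83 becomes 0x43, then 0x03, and is satisfied by
	 * the third lookup (v24-v27), i.e. sbox[0x83].
	 */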

	/* apply MixColumns transformation */
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm
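
	/*
	 * The forward path uses the standard decomposition of MixColumns:
	 * rev32 on .8h lanes swaps the 16-bit halves of each 32-bit word,
	 * so v8 ends up holding 2*a ^ rot16(a) per column, and combining
	 * that with a rotate-by-8 table lookup (v14) yields
	 *
	 *	b0 = 2*a0 ^ 3*a1 ^ a2 ^ a3
	 *
	 * and its rotations for the other column bytes. For decryption,
	 * pre-multiplying each column by the circulant { 5, 0, 4, 0 } turns
	 * the forward matrix { 2, 3, 1, 1 } into the inverse matrix
	 * { 14, 11, 13, 9 }, so the same tail serves both directions.
	 */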

	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.La\@:	eor		\in\().16b, \in\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	sub		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	cbz		\i, .Lb\@
	mix_columns	\in, \enc
	b		.La\@
.Lb\@:	eor		\in\().16b, \in\().16b, v15.16b	/* ^round key */
	.endm
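
	/*
	 * One trip around .La is one AES round: AddRoundKey (eor with v15),
	 * ShiftRows as a single tbl through the v13 permutation, SubBytes,
	 * and MixColumns on all but the final round. ShiftRows is done
	 * before SubBytes, which is fine since a byte permutation commutes
	 * with a bytewise substitution. v15 is reloaded with 0x40 right
	 * after its round-key use so sub_bytes can reuse it as the 64-byte
	 * window offset, and the next round key overwrites it afterwards.
	 */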

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to AES states in parallel.
	 */

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
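
	/*
	 * Same windowed S-box lookup as sub_bytes, but with four states in
	 * flight so the index-rebasing subs of one state can issue in the
	 * shadow of another state's tbx: without hardware AES instructions
	 * the table lookups dominate, and interleaving hides some of their
	 * latency. The _2x multiply and MixColumns helpers below interleave
	 * two states in the same way.
	 */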

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
.La\@:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	sub		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	cbz		\i, .Lb\@
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		.La\@
.Lb\@:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm
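
	/*
	 * aes-modes.S is a shared template: it defines the actual entry
	 * points for the block cipher modes (ECB, CBC, CTR, XTS, ...) in
	 * terms of the encrypt_block/decrypt_block (and 4x) macros and the
	 * AES_FUNC_START wrapper defined above, in the same way the ARMv8
	 * Crypto Extensions implementation does.
	 */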
#include "aes-modes.S"

	.section	".rodata", "a"
	.align		4
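
	/*
	 * Permutation vectors for the tbl-based ShiftRows and the rotate
	 * used by mix_columns. With the usual little-endian layout, .octa
	 * places the low byte of the constant in lane 0: .LForward_ShiftRows
	 * maps lane 0 to source byte 0x00, lane 1 to 0x05, lane 2 to 0x0a
	 * and so on, which is ShiftRows with the state stored column-major.
	 * .Lror32by8 rotates each 32-bit word right by 8 bits.
	 */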
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201