// Source: torvalds/linux, blob/master/lib/crypto/x86/aes-aesni.S
// (web-extraction header removed; interleaved line-number artifacts stripped)
/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>

.section .rodata
#ifdef __x86_64__
// 64-bit: static data must be addressed RIP-relative (position-independent).
#define RODATA(label) label(%rip)
#else
// 32-bit: plain absolute addressing of the label.
#define RODATA(label) label
#endif

// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
.p2align 4
.Lmask:
	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12

// The AES round constants, used during key expansion
.Lrcon:
	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36

.text
// Transform four dwords [a0, a1, a2, a3] in \a into
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time. We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
.macro	_prefix_sum	a, tmp
	movdqa		\a, \tmp	// [a0, a1, a2, a3]
	pslldq		$4, \a		// [0, a0, a1, a2]
	pxor		\tmp, \a	// [a0, a0^a1, a1^a2, a2^a3]
	movdqa		\a, \tmp
	pslldq		$8, \a		// [0, 0, a0, a0^a1]
	pxor		\tmp, \a	// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
// Generate the next AES round key from the previous one(s) and store it at
// (RNDKEYS). Expects MASK, RCON, and RNDKEYS to be set up by the caller;
// clobbers %xmm2 and %xmm3 as temporaries.
.macro	_gen_round_key	a, b
	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
	// the last dword of the previous round key (given in \b).
	//
	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
	// It is used here solely for the SubBytes and the XOR. The ShiftRows
	// is a no-op because all four columns are the same here.
	//
	// Don't use the 'aeskeygenassist' instruction, since:
	//    - On most Intel CPUs it is microcoded, making it have a much
	//      higher latency and use more execution ports than 'aesenclast'.
	//    - It cannot be used in a loop, since it requires an immediate.
	//    - It doesn't do much more than 'aesenclast' in the first place.
	movdqa		\b, %xmm2
	pshufb		MASK, %xmm2
	aesenclast	RCON, %xmm2

	// XOR in the prefix sum of the four dwords of \a, which is the
	// previous round key (AES-128) or the first round key in the previous
	// pair of round keys (AES-256). The result is the next round key.
	_prefix_sum	\a, tmp=%xmm3
	pxor		%xmm2, \a

	// Store the next round key to memory. Also leave it in \a.
	movdqu		\a, (RNDKEYS)
.endm
// Expand a raw AES key into the round keys, and optionally also into the
// round keys for the Equivalent Inverse Cipher (when INV_RNDKEYS != NULL).
// \is_aes128 selects AES-128 (10 rounds) vs. AES-256 (14 rounds).
// Clobbers %xmm0-%xmm3 and %xmm5-%xmm7; on i386, %ebx and %esi are
// saved/restored since they are callee-saved in that ABI.
.macro	_aes_expandkey_aesni	is_aes128
#ifdef __x86_64__
	// Arguments
	.set	RNDKEYS, %rdi
	.set	INV_RNDKEYS, %rsi
	.set	IN_KEY, %rdx

	// Other local variables
	.set	RCON_PTR, %rcx
	.set	COUNTER, %eax
#else
	// Arguments, assuming -mregparm=3
	.set	RNDKEYS, %eax
	.set	INV_RNDKEYS, %edx
	.set	IN_KEY, %ecx

	// Other local variables
	.set	RCON_PTR, %ebx
	.set	COUNTER, %esi
#endif
	.set	RCON, %xmm6
	.set	MASK, %xmm7

#ifdef __i386__
	push	%ebx
	push	%esi
#endif

.if \is_aes128
	// AES-128: the first round key is simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
.else
	// AES-256: the first two round keys are simply a copy of the raw key.
	movdqu	(IN_KEY), %xmm0
	movdqu	%xmm0, (RNDKEYS)
	movdqu	16(IN_KEY), %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
.endif

	// Generate the remaining round keys.
	movdqa	RODATA(.Lmask), MASK
.if \is_aes128
	lea	RODATA(.Lrcon), RCON_PTR
	mov	$10, COUNTER
.Lgen_next_aes128_round_key:
	add	$16, RNDKEYS
	movd	(RCON_PTR), RCON
	pshufd	$0x00, RCON, RCON	// Broadcast rcon[i] to all four dwords
	add	$4, RCON_PTR
	_gen_round_key	%xmm0, %xmm0
	dec	COUNTER
	jnz	.Lgen_next_aes128_round_key
.else
	// AES-256: only the first 7 round constants are needed, so instead of
	// loading each one from memory, just start by loading [1, 1, 1, 1] and
	// then generate the rest by doubling.
	pshufd	$0x00, RODATA(.Lrcon), RCON
	pxor	%xmm5, %xmm5		// All-zeroes
	mov	$7, COUNTER
.Lgen_next_aes256_round_key_pair:
	// Generate the next AES-256 round key: either the first of a pair of
	// two, or the last one.
	_gen_round_key	%xmm0, %xmm1

	dec	COUNTER
	jz	.Lgen_aes256_round_keys_done

	// Generate the second AES-256 round key of the pair. Compared to the
	// first, there's no rotation and no XOR of a round constant.
	pshufd	$0xff, %xmm0, %xmm2	// Get four copies of last dword
	aesenclast	%xmm5, %xmm2	// Just does SubBytes
	_prefix_sum	%xmm1, tmp=%xmm3
	pxor	%xmm2, %xmm1
	movdqu	%xmm1, 16(RNDKEYS)
	add	$32, RNDKEYS
	paddd	RCON, RCON		// RCON <<= 1
	jmp	.Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif

	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
	// Inverse Cipher to it. To do that, reverse the standard round keys,
	// and apply aesimc (InvMixColumn) to each except the first and last.
	test	INV_RNDKEYS, INV_RNDKEYS
	jz	.Ldone\@
	movdqu	(RNDKEYS), %xmm0	// Last standard round key
	movdqu	%xmm0, (INV_RNDKEYS)	// => First inverse round key
.if \is_aes128
	mov	$9, COUNTER
.else
	mov	$13, COUNTER
.endif
.Lgen_next_inv_round_key\@:
	sub	$16, RNDKEYS
	add	$16, INV_RNDKEYS
	movdqu	(RNDKEYS), %xmm0
	aesimc	%xmm0, %xmm0
	movdqu	%xmm0, (INV_RNDKEYS)
	dec	COUNTER
	jnz	.Lgen_next_inv_round_key\@
	movdqu	-16(RNDKEYS), %xmm0	// First standard round key
	movdqu	%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key

.Ldone\@:
#ifdef __i386__
	pop	%esi
	pop	%ebx
#endif
	RET
.endm
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_128]);
SYM_FUNC_START(aes128_expandkey_aesni)
	_aes_expandkey_aesni	1
SYM_FUNC_END(aes128_expandkey_aesni)
// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//			       const u8 in_key[AES_KEYSIZE_256]);
SYM_FUNC_START(aes256_expandkey_aesni)
	_aes_expandkey_aesni	0
SYM_FUNC_END(aes256_expandkey_aesni)
// Encrypt (\enc=1) or decrypt (\enc=0) one 16-byte block with the given
// expanded round keys. NROUNDS is the number of AES rounds (e.g. 10 or 14).
// Clobbers %xmm0 and %xmm1; on i386, %ebx is saved/restored around its use
// for the stack-passed fourth argument.
.macro	_aes_crypt_aesni	enc
#ifdef __x86_64__
	.set	RNDKEYS, %rdi
	.set	NROUNDS, %esi
	.set	OUT, %rdx
	.set	IN, %rcx
#else
	// Assuming -mregparm=3
	.set	RNDKEYS, %eax
	.set	NROUNDS, %edx
	.set	OUT, %ecx
	.set	IN, %ebx		// Passed on stack
#endif

#ifdef __i386__
	push	%ebx
	mov	8(%esp), %ebx		// Load 4th arg: 8 = saved ebx + retaddr
#endif

	// Zero-th round
	movdqu	(IN), %xmm0
	movdqu	(RNDKEYS), %xmm1
	pxor	%xmm1, %xmm0

	// Normal rounds
	add	$16, RNDKEYS
	dec	NROUNDS
.Lnext_round\@:
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenc	%xmm1, %xmm0
.else
	aesdec	%xmm1, %xmm0
.endif
	add	$16, RNDKEYS
	dec	NROUNDS
	jne	.Lnext_round\@

	// Last round
	movdqu	(RNDKEYS), %xmm1
.if \enc
	aesenclast	%xmm1, %xmm0
.else
	aesdeclast	%xmm1, %xmm0
.endif
	movdqu	%xmm0, (OUT)

#ifdef __i386__
	pop	%ebx
#endif
	RET
.endm
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_aesni)
	_aes_crypt_aesni	1
SYM_FUNC_END(aes_encrypt_aesni)
// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_aesni)
	_aes_crypt_aesni	0
SYM_FUNC_END(aes_decrypt_aesni)