Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/powerpc/crypto/aes-spe-core.S
26439 views
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast AES implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <[email protected]>
 */
13
#include <asm/ppc_asm.h>
14
#include "aes-spe-regs.h"
15
16
/*
 * Table lookup helper macros.
 *
 * Bit ranges below use the PowerPC convention (bit 0 = most significant
 * bit of the 32-bit word). 'bpos' selects a byte of the source register:
 * 0 is the least significant byte, 3 the most significant one.
 */

/*
 * EAD: insert byte 'bpos' of 'in' into bits 20-27 of rT0 (mask 0x00000ff0).
 * All other bits of rT0 are kept, so the byte becomes an index scaled by
 * 16 on top of whatever base rT0 already holds.
 */
#define	EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

/*
 * DAD: insert byte 'bpos' of 'in' into bits 24-31 of rT1 (mask 0x000000ff).
 * All other bits of rT1 are kept, so the byte becomes an unscaled index on
 * top of whatever base rT1 already holds.
 */
#define	DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;

/* LWH: load the word at off(rT0) and replicate it into both halves of out */
#define	LWH(out, off) \
	evlwwsplat	out,off(rT0);	/* load word high */

/*
 * LWL: load the word at off(rT0) with a plain 32-bit load. Used after LWH
 * on the same register to replace the low word with a second lookup.
 */
#define	LWL(out, off) \
	lwz		out,off(rT0);	/* load word low */

/* LBZ: load the (zero extended) byte at off(tab) */
#define	LBZ(out, tab, off) \
	lbz		out,off(tab);	/* load byte */

/* LAH: EAD followed by LWH, i.e. index rT0 by a byte and splat-load a word */
#define	LAH(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word high */ \
	LWH(out, off)

/* LAL: EAD followed by LWL, i.e. index rT0 by a byte and load the low word */
#define	LAL(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word low */ \
	LWL(out, off)

/* LAE: EAD followed by a byte load from offset 8 of the selected entry */
#define	LAE(out, in, bpos) \
	EAD(in, bpos)			/* calc addr + load enc byte */ \
	LBZ(out, rT0, 8)

/* LBE: byte load from offset 8 of the entry rT0 currently selects */
#define	LBE(out) \
	LBZ(out, rT0, 8)		/* load enc byte */

/* LAD: DAD followed by a byte load from the selected rT1 entry */
#define	LAD(out, in, bpos) \
	DAD(in, bpos)			/* calc addr + load dec byte */ \
	LBZ(out, rT1, 0)

/* LBD: byte load from the entry rT1 currently selects */
#define	LBD(out) \
	LBZ(out, rT1, 0)		/* load dec byte */
52
53
/*
 * ppc_encrypt_block: The central encryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Structure of the code:
 * - Each pass of the loop consumes 32 bytes of key material (offsets 16-47
 *   from rKP) in two identically scheduled halves, then advances rKP by 32.
 *   bdnz decrements CTR and repeats while it is non-zero.
 * - The tail of every half already issues the first three word lookups
 *   (rW4, rW6, rW3) needed by the following half; the three LAH statements
 *   at function entry do the same for the very first half.
 * - After the loop one more table/xor step is executed, followed by byte
 *   lookups (LAE/LBE) that are merged into rW0-rW3 with rlwimi. The words at
 *   32-44(rKP) are loaded into rD0-rD3 for the final xor done by the caller.
 */
_GLOBAL(ppc_encrypt_block)
	/* preload the lookups that each loop half expects to be in flight */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* first half: word lookups, combined with key bytes 16-31 of rKP */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1	/* low word of rD0 = high word of rD1 */
	/* from here on: preload rW4, rW6, rW3 for the next half */
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3	/* low word of rD2 = high word of rD3 */
	LWH(rW3, 8)
	/* second half: same schedule, combined with key bytes 32-47 of rKP */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	addi		rKP,rKP,32	/* step over the 32 key bytes just used */
	bdnz		ppc_encrypt_block_loop
	/* last table/xor step, combined with key bytes 16-31 of rKP */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* byte lookups for the output words start here, interleaved */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)
	/*
	 * Remaining byte lookups. rW0-rW3 receive the lowest byte directly,
	 * the other three bytes are fetched into rW4-rW7 and inserted with
	 * rlwimi (shift 8/16/24 -> mask 0xff00/0xff0000/0xff000000).
	 * NOTE(review): the LAE(rW1, rD0, 0) below appears to repeat the
	 * lookup already done by EAD(rD0, 0) + LBE(rW1) above - confirm.
	 */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free once their last lookup is issued: load key words */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr
200
201
/*
 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in
 * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output
 * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten during
 * processing.
 *
 * NOTE(review): "encryption key" above is the original wording; presumably
 * the key schedule used for decryption is meant - confirm against caller.
 *
 * Structure of the code:
 * - Each pass of the loop consumes 32 bytes of key material (offsets 16-47
 *   from rKP) in two identically scheduled halves, then advances rKP by 32.
 *   bdnz decrements CTR and repeats while it is non-zero.
 * - The tail of every half already issues the first three word lookups
 *   (rW0, rW6, rW3) needed by the following half; the three LAH statements
 *   at function entry do the same for the very first half.
 * - After the loop one more table/xor step is executed, followed by byte
 *   lookups through rT1 (LAD/LBD) that are merged into rW0-rW3 with rlwimi.
 *   The words at 32-44(rKP) are loaded into rD0-rD3 for the final xor done
 *   by the caller.
 */
_GLOBAL(ppc_decrypt_block)
	/* preload the lookups that each loop half expects to be in flight */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* first half: word lookups, combined with key bytes 16-31 of rKP */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1	/* low word of rD0 = high word of rD1 */
	/* from here on: preload rW0, rW6, rW3 for the next half */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3	/* low word of rD2 = high word of rD3 */
	LWH(rW3, 8)
	/* second half: same schedule, combined with key bytes 32-47 of rKP */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	addi		rKP,rKP,32	/* step over the 32 key bytes just used */
	bdnz		ppc_decrypt_block_loop
	/* last table/xor step, combined with key bytes 16-31 of rKP */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* byte lookups through rT1 for the output words start here */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)
	/*
	 * Remaining byte lookups. rW0-rW3 receive the lowest byte directly,
	 * the other three bytes are fetched into rW4-rW7 and inserted with
	 * rlwimi (shift 8/16/24 -> mask 0xff00/0xff0000/0xff000000).
	 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	/* rD0-rD3 are free once their last lookup is issued: load key words */
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	blr
347
348