GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/symcipher/aes_pwr8.c
/*
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS 1
#include "inner.h"

/*
 * This code contains the AES key schedule implementation using the
 * POWER8 opcodes.
 */
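
/*
 * For reference, the transformation implemented below is the standard
 * AES key expansion (FIPS 197, section 5.2). The plain C sketch that
 * follows is illustrative only and kept disabled: the helper name
 * ref_skey_128() is hypothetical, and it assumes the br_aes_S[] S-box
 * table and the br_dec32be() decoder from BearSSL's inner.h. Byte order
 * of the stored subkeys is handled separately by the byteswapping steps
 * in the POWER8 code.
 */
#if 0
static void
ref_skey_128(uint32_t skey[44], const unsigned char *key)
{
        static const unsigned char Rcon[] = {
                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
        };
        int i;

        /* First subkey is the key itself. */
        for (i = 0; i < 4; i ++) {
                skey[i] = br_dec32be(key + (i << 2));
        }
        for (i = 4; i < 44; i ++) {
                uint32_t t;

                t = skey[i - 1];
                if ((i & 3) == 0) {
                        /* RotWord, then SubWord, then XOR with Rcon */
                        t = (t << 8) | (t >> 24);
                        t = ((uint32_t)br_aes_S[t >> 24] << 24)
                                | ((uint32_t)br_aes_S[(t >> 16) & 0xFF] << 16)
                                | ((uint32_t)br_aes_S[(t >> 8) & 0xFF] << 8)
                                | (uint32_t)br_aes_S[t & 0xFF];
                        t ^= (uint32_t)Rcon[(i >> 2) - 1] << 24;
                }
                skey[i] = skey[i - 4] ^ t;
        }
}
#endif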

#if BR_POWER8

static void
key_schedule_128(unsigned char *sk, const unsigned char *key)
{
        long cc;

        static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
#if BR_POWER8_LE
        static const uint32_t idx2be[] = {
                0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
        };
#endif

        cc = 0;

        /*
         * We use the VSX instructions for loading and storing the
         * key/subkeys, since they support unaligned accesses. The rest
         * of the computation is VMX only. VMX register 0 is VSX
         * register 32.
         */
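        /*
         * (In the asm below, the lxvw4x/stxvw4x macros take VSX register
         * numbers: 34 is VMX v2, 36 is v4, 39 is v7 and 40 is v8, per
         * the "VMX register 0 is VSX register 32" rule above.)
         */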
        asm volatile (

                /*
                 * v0 = all-zero word
                 * v1 = constant -8 / +8, copied into four words
                 * v2 = current subkey
                 * v3 = Rcon (x4 words)
                 * v6 = constant 8, copied into four words
                 * v7 = constant 0x11B, copied into four words
                 * v8 = constant for byteswapping words
                 */
                vspltisw(0, 0)
#if BR_POWER8_LE
                vspltisw(1, -8)
#else
                vspltisw(1, 8)
#endif
                lxvw4x(34, 0, %[key])
                vspltisw(3, 1)
                vspltisw(6, 8)
                lxvw4x(39, 0, %[fmod])
#if BR_POWER8_LE
                lxvw4x(40, 0, %[idx2be])
#endif

                /*
                 * First subkey is a copy of the key itself.
                 */
#if BR_POWER8_LE
                vperm(4, 2, 2, 8)
                stxvw4x(36, 0, %[sk])
#else
                stxvw4x(34, 0, %[sk])
#endif

                /*
                 * Loop must run 10 times.
                 */
                li(%[cc], 10)
                mtctr(%[cc])
        label(loop)
                /* Increment subkey address */
                addi(%[sk], %[sk], 16)

                /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */
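                /*
                 * (vrlw with v1 = +/-8 rotates each word by one byte,
                 * i.e. RotWord() for the byte order in use; vsbox
                 * applies the AES S-box to every byte, i.e. SubWord();
                 * the result is XORed with the Rcon word from v3,
                 * shifted into place on big-endian, and splatted to all
                 * four word positions of v4.)
                 */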
                vrlw(4, 2, 1)
                vsbox(4, 4)
#if BR_POWER8_LE
                vxor(4, 4, 3)
#else
                vsldoi(5, 3, 0, 3)
                vxor(4, 4, 5)
#endif
                vspltw(4, 4, 3)

                /* XOR words for next subkey */
                vsldoi(5, 0, 2, 12)
                vxor(2, 2, 5)
                vsldoi(5, 0, 2, 12)
                vxor(2, 2, 5)
                vsldoi(5, 0, 2, 12)
                vxor(2, 2, 5)
                vxor(2, 2, 4)

                /* Store next subkey */
#if BR_POWER8_LE
                vperm(4, 2, 2, 8)
                stxvw4x(36, 0, %[sk])
#else
                stxvw4x(34, 0, %[sk])
#endif

                /* Update Rcon */
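                /*
                 * (Rcon doubling in GF(2^8): vadduwm doubles each word,
                 * vsrw/vsubuwm turn the bit that overflows the low byte
                 * into an all-ones mask, and vand/vxor conditionally
                 * reduce modulo the field polynomial 0x11B held in v7.)
                 */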
                vadduwm(3, 3, 3)
                vsrw(4, 3, 6)
                vsubuwm(4, 0, 4)
                vand(4, 4, 7)
                vxor(3, 3, 4)

                bdnz(loop)

        : [sk] "+b" (sk), [cc] "+b" (cc)
        : [key] "b" (key), [fmod] "b" (fmod)
#if BR_POWER8_LE
        , [idx2be] "b" (idx2be)
#endif
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
        );
}

static void
key_schedule_192(unsigned char *sk, const unsigned char *key)
{
        long cc;

#if BR_POWER8_LE
        static const uint32_t idx2be[] = {
                0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
        };
#endif

        cc = 0;

        /*
         * We use the VSX instructions for loading and storing the
         * key/subkeys, since they support unaligned accesses. The rest
         * of the computation is VMX only. VMX register 0 is VSX
         * register 32.
         */
        asm volatile (

                /*
                 * v0 = all-zero word
                 * v1 = constant -8 / +8, copied into four words
                 * v2, v3 = current subkey
                 * v5 = Rcon (x4 words) (already shifted on big-endian)
                 * v6 = constant 8, copied into four words
                 * v8 = constant for byteswapping words
                 *
                 * The left two words of v3 are ignored.
                 */
                vspltisw(0, 0)
#if BR_POWER8_LE
                vspltisw(1, -8)
#else
                vspltisw(1, 8)
#endif
                li(%[cc], 8)
                lxvw4x(34, 0, %[key])
                lxvw4x(35, %[cc], %[key])
                vsldoi(3, 3, 0, 8)
                vspltisw(5, 1)
#if !BR_POWER8_LE
                vsldoi(5, 5, 0, 3)
#endif
                vspltisw(6, 8)
#if BR_POWER8_LE
                lxvw4x(40, 0, %[idx2be])
#endif

                /*
                 * Loop must run 8 times. Each iteration produces 256
                 * bits of subkeys, with a 64-bit overlap.
                 */
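                /*
                 * (AES-192 uses 13 subkeys, i.e. 52 32-bit words; each
                 * iteration computes 6 new words but stores two full
                 * 128-bit vectors, hence the 64-bit overlap between
                 * consecutive stores and the 24-byte pointer increment.)
                 */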
                li(%[cc], 8)
                mtctr(%[cc])
                li(%[cc], 16)
        label(loop)

                /*
                 * Last 6 words in v2:v3l. Compute next 6 words into
                 * v3r:v4.
                 */
                vrlw(10, 3, 1)
                vsbox(10, 10)
                vxor(10, 10, 5)
                vspltw(10, 10, 1)
                vsldoi(11, 0, 10, 8)

                vsldoi(12, 0, 2, 12)
                vxor(12, 2, 12)
                vsldoi(13, 0, 12, 12)
                vxor(12, 12, 13)
                vsldoi(13, 0, 12, 12)
                vxor(12, 12, 13)

                vspltw(13, 12, 3)
                vxor(13, 13, 3)
                vsldoi(14, 0, 3, 12)
                vxor(13, 13, 14)

                vsldoi(4, 12, 13, 8)
                vsldoi(14, 0, 3, 8)
                vsldoi(3, 14, 12, 8)

                vxor(3, 3, 11)
                vxor(4, 4, 10)

                /*
                 * Update Rcon. Since for a 192-bit key, we use only 8
                 * such constants, we will not hit the field modulus,
                 * so a simple shift (addition) works well.
                 */
                vadduwm(5, 5, 5)

                /*
                 * Write out the two left 128-bit words
                 */
#if BR_POWER8_LE
                vperm(10, 2, 2, 8)
                vperm(11, 3, 3, 8)
                stxvw4x(42, 0, %[sk])
                stxvw4x(43, %[cc], %[sk])
#else
                stxvw4x(34, 0, %[sk])
                stxvw4x(35, %[cc], %[sk])
#endif
                addi(%[sk], %[sk], 24)

                /*
                 * Shift words for next iteration.
                 */
                vsldoi(2, 3, 4, 8)
                vsldoi(3, 4, 0, 8)

                bdnz(loop)

                /*
                 * The loop wrote the first 50 subkey words, but we need
                 * to produce 52, so we must do one last write.
                 */
#if BR_POWER8_LE
                vperm(10, 2, 2, 8)
                stxvw4x(42, 0, %[sk])
#else
                stxvw4x(34, 0, %[sk])
#endif

        : [sk] "+b" (sk), [cc] "+b" (cc)
        : [key] "b" (key)
#if BR_POWER8_LE
        , [idx2be] "b" (idx2be)
#endif
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
        );
}

static void
key_schedule_256(unsigned char *sk, const unsigned char *key)
{
        long cc;

#if BR_POWER8_LE
        static const uint32_t idx2be[] = {
                0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
        };
#endif

        cc = 0;

        /*
         * We use the VSX instructions for loading and storing the
         * key/subkeys, since they support unaligned accesses. The rest
         * of the computation is VMX only. VMX register 0 is VSX
         * register 32.
         */
        asm volatile (

                /*
                 * v0 = all-zero word
                 * v1 = constant -8 / +8, copied into four words
                 * v2, v3 = current subkey
                 * v6 = Rcon (x4 words) (already shifted on big-endian)
                 * v7 = constant 8, copied into four words
                 * v8 = constant for byteswapping words
                 *
                 * The left two words of v3 are ignored.
                 */
                vspltisw(0, 0)
#if BR_POWER8_LE
                vspltisw(1, -8)
#else
                vspltisw(1, 8)
#endif
                li(%[cc], 16)
                lxvw4x(34, 0, %[key])
                lxvw4x(35, %[cc], %[key])
                vspltisw(6, 1)
#if !BR_POWER8_LE
                vsldoi(6, 6, 0, 3)
#endif
                vspltisw(7, 8)
#if BR_POWER8_LE
                lxvw4x(40, 0, %[idx2be])
#endif

                /*
                 * Loop must run 7 times. Each iteration produces two
                 * subkeys.
                 */
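                /*
                 * (AES-256 uses 15 subkeys; the first two are the key
                 * itself. Each iteration stores the current v2:v3 pair
                 * and computes the next pair into v4:v5, so 7 iterations
                 * emit 14 subkeys and a final store after the loop emits
                 * the 15th.)
                 */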
                li(%[cc], 7)
                mtctr(%[cc])
                li(%[cc], 16)
        label(loop)

                /*
                 * Current words are in v2:v3. Compute next word in v4.
                 */
                vrlw(10, 3, 1)
                vsbox(10, 10)
                vxor(10, 10, 6)
                vspltw(10, 10, 3)

                vsldoi(4, 0, 2, 12)
                vxor(4, 2, 4)
                vsldoi(5, 0, 4, 12)
                vxor(4, 4, 5)
                vsldoi(5, 0, 4, 12)
                vxor(4, 4, 5)
                vxor(4, 4, 10)

                /*
                 * Then other word in v5.
                 */
                vsbox(10, 4)
                vspltw(10, 10, 3)

                vsldoi(5, 0, 3, 12)
                vxor(5, 3, 5)
                vsldoi(11, 0, 5, 12)
                vxor(5, 5, 11)
                vsldoi(11, 0, 5, 12)
                vxor(5, 5, 11)
                vxor(5, 5, 10)

                /*
                 * Update Rcon. Since for a 256-bit key, we use only 7
                 * such constants, we will not hit the field modulus,
                 * so a simple shift (addition) works well.
                 */
                vadduwm(6, 6, 6)

                /*
                 * Write out the two left 128-bit words
                 */
#if BR_POWER8_LE
                vperm(10, 2, 2, 8)
                vperm(11, 3, 3, 8)
                stxvw4x(42, 0, %[sk])
                stxvw4x(43, %[cc], %[sk])
#else
                stxvw4x(34, 0, %[sk])
                stxvw4x(35, %[cc], %[sk])
#endif
                addi(%[sk], %[sk], 32)

                /*
                 * Replace v2:v3 with v4:v5.
                 */
                vxor(2, 0, 4)
                vxor(3, 0, 5)

                bdnz(loop)

                /*
                 * The loop wrote the first 14 subkeys, but we need 15,
                 * so we must do an extra write.
                 */
#if BR_POWER8_LE
                vperm(10, 2, 2, 8)
                stxvw4x(42, 0, %[sk])
#else
                stxvw4x(34, 0, %[sk])
#endif

        : [sk] "+b" (sk), [cc] "+b" (cc)
        : [key] "b" (key)
#if BR_POWER8_LE
        , [idx2be] "b" (idx2be)
#endif
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
        );
}

/* see inner.h */
int
br_aes_pwr8_supported(void)
{
        return 1;
}

/* see inner.h */
unsigned
br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
{
        switch (len) {
        case 16:
                key_schedule_128(sk, key);
                return 10;
        case 24:
                key_schedule_192(sk, key);
                return 12;
        default:
                key_schedule_256(sk, key);
                return 14;
        }
}
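
/*
 * Minimal usage sketch (illustrative only, not part of BearSSL's API):
 * a caller would first check br_aes_pwr8_supported(), then expand the
 * raw key into a subkey buffer of at least 15 * 16 bytes; the return
 * value is the number of AES rounds (10, 12 or 14). The function and
 * variable names below are hypothetical, so the block is kept disabled.
 */
#if 0
static int
example_expand(const unsigned char *key, size_t key_len)
{
        unsigned char skey[15 * 16];
        unsigned num_rounds;

        if (!br_aes_pwr8_supported()) {
                return 0;
        }
        num_rounds = br_aes_pwr8_keysched(skey, key, key_len);
        return (int)num_rounds;
}
#endif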

#endif