GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/crypto/aria-aesni-avx-asm_64.S
1
/* SPDX-License-Identifier: GPL-2.0-or-later */
2
/*
3
* ARIA Cipher 16-way parallel algorithm (AVX)
4
*
5
* Copyright (c) 2022 Taehee Yoo <[email protected]>
6
*
7
*/
8
9
#include <linux/linkage.h>
10
#include <linux/cfi_types.h>
11
#include <asm/asm-offsets.h>
12
#include <asm/frame.h>
13
14
/* register macros */
15
#define CTX %rdi
16
17
18
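/* BV8(): pack eight GF(2) bits into one byte value, with a0 in bit 0. */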
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
19
( (((a0) & 1) << 0) | \
20
(((a1) & 1) << 1) | \
21
(((a2) & 1) << 2) | \
22
(((a3) & 1) << 3) | \
23
(((a4) & 1) << 4) | \
24
(((a5) & 1) << 5) | \
25
(((a6) & 1) << 6) | \
26
(((a7) & 1) << 7) )
27
28
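/*
 * BM8X8(): pack eight BV8() rows into a 64-bit 8x8 bit-matrix, with row l0
 * in the most significant byte; used to build the affine bit-matrix
 * constants consumed by the GF2P8AFFINE*QB instructions below.
 */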
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
29
( ((l7) << (0 * 8)) | \
30
((l6) << (1 * 8)) | \
31
((l5) << (2 * 8)) | \
32
((l4) << (3 * 8)) | \
33
((l3) << (4 * 8)) | \
34
((l2) << (5 * 8)) | \
35
((l1) << (6 * 8)) | \
36
((l0) << (7 * 8)) )
37
38
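/*
 * inc_le128(): add 1 to the 128-bit little-endian counter in x.
 * minus_one must hold { -1, 0 } (low/high qword); tmp is clobbered and is
 * used to propagate the carry into the high qword when the low one wraps.
 */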
#define inc_le128(x, minus_one, tmp) \
39
vpcmpeqq minus_one, x, tmp; \
40
vpsubq minus_one, x, x; \
41
vpslldq $8, tmp, tmp; \
42
vpsubq tmp, x, x;
43
44
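/*
 * filter_8bit(): apply an 8-bit transform as two 4-bit table lookups:
 * lo_t is indexed by the low nibble, hi_t by the high nibble, and the two
 * results are XORed. mask4bit must hold 0x0f in every byte; tmp0 is
 * clobbered.
 */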
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
45
vpand x, mask4bit, tmp0; \
46
vpandn x, mask4bit, x; \
47
vpsrld $4, x, x; \
48
\
49
vpshufb tmp0, lo_t, tmp0; \
50
vpshufb x, hi_t, x; \
51
vpxor tmp0, x, x;
52
53
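/*
 * transpose_4x4(): transpose the 4x4 matrix of 32-bit words held in
 * x0..x3; t1 and t2 are clobbered.
 */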
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
54
vpunpckhdq x1, x0, t2; \
55
vpunpckldq x1, x0, x0; \
56
\
57
vpunpckldq x3, x2, t1; \
58
vpunpckhdq x3, x2, x2; \
59
\
60
vpunpckhqdq t1, x0, x1; \
61
vpunpcklqdq t1, x0, x0; \
62
\
63
vpunpckhqdq x2, t2, x3; \
64
vpunpcklqdq x2, t2, x2;
65
66
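/*
 * byteslice_16x16b(): rearrange the sixteen loaded 16-byte blocks in
 * a0..d3 into byte-sliced form, so that each register holds one byte
 * position of all 16 blocks; st0/st1 are 16-byte scratch slots in memory.
 */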
#define byteslice_16x16b(a0, b0, c0, d0, \
67
a1, b1, c1, d1, \
68
a2, b2, c2, d2, \
69
a3, b3, c3, d3, \
70
st0, st1) \
71
vmovdqu d2, st0; \
72
vmovdqu d3, st1; \
73
transpose_4x4(a0, a1, a2, a3, d2, d3); \
74
transpose_4x4(b0, b1, b2, b3, d2, d3); \
75
vmovdqu st0, d2; \
76
vmovdqu st1, d3; \
77
\
78
vmovdqu a0, st0; \
79
vmovdqu a1, st1; \
80
transpose_4x4(c0, c1, c2, c3, a0, a1); \
81
transpose_4x4(d0, d1, d2, d3, a0, a1); \
82
\
83
vmovdqu .Lshufb_16x16b(%rip), a0; \
84
vmovdqu st1, a1; \
85
vpshufb a0, a2, a2; \
86
vpshufb a0, a3, a3; \
87
vpshufb a0, b0, b0; \
88
vpshufb a0, b1, b1; \
89
vpshufb a0, b2, b2; \
90
vpshufb a0, b3, b3; \
91
vpshufb a0, a1, a1; \
92
vpshufb a0, c0, c0; \
93
vpshufb a0, c1, c1; \
94
vpshufb a0, c2, c2; \
95
vpshufb a0, c3, c3; \
96
vpshufb a0, d0, d0; \
97
vpshufb a0, d1, d1; \
98
vpshufb a0, d2, d2; \
99
vpshufb a0, d3, d3; \
100
vmovdqu d3, st1; \
101
vmovdqu st0, d3; \
102
vpshufb a0, d3, a0; \
103
vmovdqu d2, st0; \
104
\
105
transpose_4x4(a0, b0, c0, d0, d2, d3); \
106
transpose_4x4(a1, b1, c1, d1, d2, d3); \
107
vmovdqu st0, d2; \
108
vmovdqu st1, d3; \
109
\
110
vmovdqu b0, st0; \
111
vmovdqu b1, st1; \
112
transpose_4x4(a2, b2, c2, d2, b0, b1); \
113
transpose_4x4(a3, b3, c3, d3, b0, b1); \
114
vmovdqu st0, b0; \
115
vmovdqu st1, b1; \
116
/* does not adjust output bytes inside vectors */
117
118
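/*
 * debyteslice_16x16b(): inverse of byteslice_16x16b(); convert the
 * byte-sliced registers back into sixteen contiguous 16-byte blocks
 * (again without adjusting byte order inside the vectors).
 */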
#define debyteslice_16x16b(a0, b0, c0, d0, \
119
a1, b1, c1, d1, \
120
a2, b2, c2, d2, \
121
a3, b3, c3, d3, \
122
st0, st1) \
123
vmovdqu d2, st0; \
124
vmovdqu d3, st1; \
125
transpose_4x4(a0, a1, a2, a3, d2, d3); \
126
transpose_4x4(b0, b1, b2, b3, d2, d3); \
127
vmovdqu st0, d2; \
128
vmovdqu st1, d3; \
129
\
130
vmovdqu a0, st0; \
131
vmovdqu a1, st1; \
132
transpose_4x4(c0, c1, c2, c3, a0, a1); \
133
transpose_4x4(d0, d1, d2, d3, a0, a1); \
134
\
135
vmovdqu .Lshufb_16x16b(%rip), a0; \
136
vmovdqu st1, a1; \
137
vpshufb a0, a2, a2; \
138
vpshufb a0, a3, a3; \
139
vpshufb a0, b0, b0; \
140
vpshufb a0, b1, b1; \
141
vpshufb a0, b2, b2; \
142
vpshufb a0, b3, b3; \
143
vpshufb a0, a1, a1; \
144
vpshufb a0, c0, c0; \
145
vpshufb a0, c1, c1; \
146
vpshufb a0, c2, c2; \
147
vpshufb a0, c3, c3; \
148
vpshufb a0, d0, d0; \
149
vpshufb a0, d1, d1; \
150
vpshufb a0, d2, d2; \
151
vpshufb a0, d3, d3; \
152
vmovdqu d3, st1; \
153
vmovdqu st0, d3; \
154
vpshufb a0, d3, a0; \
155
vmovdqu d2, st0; \
156
\
157
transpose_4x4(c0, d0, a0, b0, d2, d3); \
158
transpose_4x4(c1, d1, a1, b1, d2, d3); \
159
vmovdqu st0, d2; \
160
vmovdqu st1, d3; \
161
\
162
vmovdqu b0, st0; \
163
vmovdqu b1, st1; \
164
transpose_4x4(c2, d2, a2, b2, b0, b1); \
165
transpose_4x4(c3, d3, a3, b3, b0, b1); \
166
vmovdqu st0, b0; \
167
vmovdqu st1, b1; \
168
/* does not adjust output bytes inside vectors */
169
170
/* load blocks to registers and apply pre-whitening */
171
#define inpack16_pre(x0, x1, x2, x3, \
172
x4, x5, x6, x7, \
173
y0, y1, y2, y3, \
174
y4, y5, y6, y7, \
175
rio) \
176
vmovdqu (0 * 16)(rio), x0; \
177
vmovdqu (1 * 16)(rio), x1; \
178
vmovdqu (2 * 16)(rio), x2; \
179
vmovdqu (3 * 16)(rio), x3; \
180
vmovdqu (4 * 16)(rio), x4; \
181
vmovdqu (5 * 16)(rio), x5; \
182
vmovdqu (6 * 16)(rio), x6; \
183
vmovdqu (7 * 16)(rio), x7; \
184
vmovdqu (8 * 16)(rio), y0; \
185
vmovdqu (9 * 16)(rio), y1; \
186
vmovdqu (10 * 16)(rio), y2; \
187
vmovdqu (11 * 16)(rio), y3; \
188
vmovdqu (12 * 16)(rio), y4; \
189
vmovdqu (13 * 16)(rio), y5; \
190
vmovdqu (14 * 16)(rio), y6; \
191
vmovdqu (15 * 16)(rio), y7;
192
193
/* byteslice pre-whitened blocks and store to temporary memory */
194
#define inpack16_post(x0, x1, x2, x3, \
195
x4, x5, x6, x7, \
196
y0, y1, y2, y3, \
197
y4, y5, y6, y7, \
198
mem_ab, mem_cd) \
199
byteslice_16x16b(x0, x1, x2, x3, \
200
x4, x5, x6, x7, \
201
y0, y1, y2, y3, \
202
y4, y5, y6, y7, \
203
(mem_ab), (mem_cd)); \
204
\
205
vmovdqu x0, 0 * 16(mem_ab); \
206
vmovdqu x1, 1 * 16(mem_ab); \
207
vmovdqu x2, 2 * 16(mem_ab); \
208
vmovdqu x3, 3 * 16(mem_ab); \
209
vmovdqu x4, 4 * 16(mem_ab); \
210
vmovdqu x5, 5 * 16(mem_ab); \
211
vmovdqu x6, 6 * 16(mem_ab); \
212
vmovdqu x7, 7 * 16(mem_ab); \
213
vmovdqu y0, 0 * 16(mem_cd); \
214
vmovdqu y1, 1 * 16(mem_cd); \
215
vmovdqu y2, 2 * 16(mem_cd); \
216
vmovdqu y3, 3 * 16(mem_cd); \
217
vmovdqu y4, 4 * 16(mem_cd); \
218
vmovdqu y5, 5 * 16(mem_cd); \
219
vmovdqu y6, 6 * 16(mem_cd); \
220
vmovdqu y7, 7 * 16(mem_cd);
221
222
#define write_output(x0, x1, x2, x3, \
223
x4, x5, x6, x7, \
224
y0, y1, y2, y3, \
225
y4, y5, y6, y7, \
226
mem) \
227
vmovdqu x0, 0 * 16(mem); \
228
vmovdqu x1, 1 * 16(mem); \
229
vmovdqu x2, 2 * 16(mem); \
230
vmovdqu x3, 3 * 16(mem); \
231
vmovdqu x4, 4 * 16(mem); \
232
vmovdqu x5, 5 * 16(mem); \
233
vmovdqu x6, 6 * 16(mem); \
234
vmovdqu x7, 7 * 16(mem); \
235
vmovdqu y0, 8 * 16(mem); \
236
vmovdqu y1, 9 * 16(mem); \
237
vmovdqu y2, 10 * 16(mem); \
238
vmovdqu y3, 11 * 16(mem); \
239
vmovdqu y4, 12 * 16(mem); \
240
vmovdqu y5, 13 * 16(mem); \
241
vmovdqu y6, 14 * 16(mem); \
242
vmovdqu y7, 15 * 16(mem); \
243
244
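/*
 * aria_store_state_8way()/aria_load_state_8way(): spill or reload eight
 * 16-byte state registers to/from mem_tmp, starting at 16-byte slot idx.
 */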
#define aria_store_state_8way(x0, x1, x2, x3, \
245
x4, x5, x6, x7, \
246
mem_tmp, idx) \
247
vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
248
vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
249
vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
250
vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
251
vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
252
vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
253
vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
254
vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
256
#define aria_load_state_8way(x0, x1, x2, x3, \
257
x4, x5, x6, x7, \
258
mem_tmp, idx) \
259
vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
260
vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
261
vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
262
vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
263
vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
264
vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
265
vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
266
vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
268
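/*
 * aria_ark_8way(): AddRoundKey for eight byte-sliced registers. Broadcasts
 * the 32-bit round-key words at rk[round * 16 + idx] and rk[round * 16 +
 * idx + 4], splats each of their four bytes across a register (t1 must be
 * all-zero so that vpshufb acts as a byte broadcast) and XORs them into
 * x0..x3 and x4..x7 respectively.
 */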
#define aria_ark_8way(x0, x1, x2, x3, \
269
x4, x5, x6, x7, \
270
t0, t1, t2, rk, \
271
idx, round) \
272
/* AddRoundKey */ \
273
vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
274
vpsrld $24, t0, t2; \
275
vpshufb t1, t2, t2; \
276
vpxor t2, x0, x0; \
277
vpsrld $16, t0, t2; \
278
vpshufb t1, t2, t2; \
279
vpxor t2, x1, x1; \
280
vpsrld $8, t0, t2; \
281
vpshufb t1, t2, t2; \
282
vpxor t2, x2, x2; \
283
vpshufb t1, t0, t2; \
284
vpxor t2, x3, x3; \
285
vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
286
vpsrld $24, t0, t2; \
287
vpshufb t1, t2, t2; \
288
vpxor t2, x4, x4; \
289
vpsrld $16, t0, t2; \
290
vpshufb t1, t2, t2; \
291
vpxor t2, x5, x5; \
292
vpsrld $8, t0, t2; \
293
vpshufb t1, t2, t2; \
294
vpxor t2, x6, x6; \
295
vpshufb t1, t0, t2; \
296
vpxor t2, x7, x7;
297
298
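/*
 * aria_sbox_8way_gfni(): ARIA substitution layer for eight byte-sliced
 * registers using GFNI; the S1/S2 S-boxes and their inverses X1/X2 are
 * evaluated with GF(2^8) affine(-inverse) transforms built from the
 * bit-matrix constants loaded into t0..t4.
 */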
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
299
x4, x5, x6, x7, \
300
t0, t1, t2, t3, \
301
t4, t5, t6, t7) \
302
vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
303
vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
304
vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
305
vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
306
vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
307
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
308
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
309
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
310
vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
311
vgf2p8affineinvqb $0, t2, x2, x2; \
312
vgf2p8affineinvqb $0, t2, x6, x6; \
313
vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
314
vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
315
vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
316
vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
317
vgf2p8affineinvqb $0, t2, x3, x3; \
318
vgf2p8affineinvqb $0, t2, x7, x7
319
320
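/*
 * aria_sbox_8way(): ARIA substitution layer without GFNI. AESENCLAST/
 * AESDECLAST with an all-zero round key (t7 must be zero) provide the AES
 * SubBytes/InvSubBytes step, the shuffle tables undo their ShiftRows, and
 * filter_8bit() applies the extra affine transforms needed for S2 and X2.
 */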
#define aria_sbox_8way(x0, x1, x2, x3, \
321
x4, x5, x6, x7, \
322
t0, t1, t2, t3, \
323
t4, t5, t6, t7) \
324
vmovdqa .Linv_shift_row(%rip), t0; \
325
vmovdqa .Lshift_row(%rip), t1; \
326
vbroadcastss .L0f0f0f0f(%rip), t6; \
327
vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
328
vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
329
vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
330
vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
331
\
332
vaesenclast t7, x0, x0; \
333
vaesenclast t7, x4, x4; \
334
vaesenclast t7, x1, x1; \
335
vaesenclast t7, x5, x5; \
336
vaesdeclast t7, x2, x2; \
337
vaesdeclast t7, x6, x6; \
338
\
339
/* AES inverse shift rows */ \
340
vpshufb t0, x0, x0; \
341
vpshufb t0, x4, x4; \
342
vpshufb t0, x1, x1; \
343
vpshufb t0, x5, x5; \
344
vpshufb t1, x3, x3; \
345
vpshufb t1, x7, x7; \
346
vpshufb t1, x2, x2; \
347
vpshufb t1, x6, x6; \
348
\
349
/* affine transformation for S2 */ \
350
filter_8bit(x1, t2, t3, t6, t0); \
351
/* affine transformation for S2 */ \
352
filter_8bit(x5, t2, t3, t6, t0); \
353
\
354
/* affine transformation for X2 */ \
355
filter_8bit(x3, t4, t5, t6, t0); \
356
/* affine transformation for X2 */ \
357
filter_8bit(x7, t4, t5, t6, t0); \
358
vaesdeclast t7, x3, x3; \
359
vaesdeclast t7, x7, x7;
360
361
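/*
 * aria_diff_m(): part of ARIA's diffusion layer on byte-sliced registers;
 * because each byte position lives in its own register here, the rotr32()
 * operations of the reference code (see comments below) become register
 * permutations combined with XORs.
 */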
#define aria_diff_m(x0, x1, x2, x3, \
362
t0, t1, t2, t3) \
363
/* T = rotr32(X, 8); */ \
364
/* X ^= T */ \
365
vpxor x0, x3, t0; \
366
vpxor x1, x0, t1; \
367
vpxor x2, x1, t2; \
368
vpxor x3, x2, t3; \
369
/* X = T ^ rotr(X, 16); */ \
370
vpxor t2, x0, x0; \
371
vpxor x1, t3, t3; \
372
vpxor t0, x2, x2; \
373
vpxor t1, x3, x1; \
374
vmovdqu t3, x3;
375
376
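/*
 * aria_diff_word(): XOR mixing between the four 32-bit word groups
 * T0..T3 of the state (see the t0..t3 comments below); the word-level
 * part of ARIA's diffusion layer.
 */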
#define aria_diff_word(x0, x1, x2, x3, \
377
x4, x5, x6, x7, \
378
y0, y1, y2, y3, \
379
y4, y5, y6, y7) \
380
/* t1 ^= t2; */ \
381
vpxor y0, x4, x4; \
382
vpxor y1, x5, x5; \
383
vpxor y2, x6, x6; \
384
vpxor y3, x7, x7; \
385
\
386
/* t2 ^= t3; */ \
387
vpxor y4, y0, y0; \
388
vpxor y5, y1, y1; \
389
vpxor y6, y2, y2; \
390
vpxor y7, y3, y3; \
391
\
392
/* t0 ^= t1; */ \
393
vpxor x4, x0, x0; \
394
vpxor x5, x1, x1; \
395
vpxor x6, x2, x2; \
396
vpxor x7, x3, x3; \
397
\
398
/* t3 ^= t1; */ \
399
vpxor x4, y4, y4; \
400
vpxor x5, y5, y5; \
401
vpxor x6, y6, y6; \
402
vpxor x7, y7, y7; \
403
\
404
/* t2 ^= t0; */ \
405
vpxor x0, y0, y0; \
406
vpxor x1, y1, y1; \
407
vpxor x2, y2, y2; \
408
vpxor x3, y3, y3; \
409
\
410
/* t1 ^= t2; */ \
411
vpxor y0, x4, x4; \
412
vpxor y1, x5, x5; \
413
vpxor y2, x6, x6; \
414
vpxor y3, x7, x7;
415
416
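/*
 * aria_fe(): one even ARIA round on all 16 byte-sliced blocks:
 * AddRoundKey, substitution layer and diffusion, processed as two
 * 8-register halves spilled through mem_tmp, followed by the word/byte
 * permutation part of the diffusion (see the aria_diff_byte() comment
 * below).
 */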
#define aria_fe(x0, x1, x2, x3, \
417
x4, x5, x6, x7, \
418
y0, y1, y2, y3, \
419
y4, y5, y6, y7, \
420
mem_tmp, rk, round) \
421
vpxor y7, y7, y7; \
422
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
423
y0, y7, y2, rk, 8, round); \
424
\
425
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
426
y0, y1, y2, y3, y4, y5, y6, y7); \
427
\
428
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
429
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
430
aria_store_state_8way(x0, x1, x2, x3, \
431
x4, x5, x6, x7, \
432
mem_tmp, 8); \
433
\
434
aria_load_state_8way(x0, x1, x2, x3, \
435
x4, x5, x6, x7, \
436
mem_tmp, 0); \
437
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
438
y0, y7, y2, rk, 0, round); \
439
\
440
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
441
y0, y1, y2, y3, y4, y5, y6, y7); \
442
\
443
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
444
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
445
aria_store_state_8way(x0, x1, x2, x3, \
446
x4, x5, x6, x7, \
447
mem_tmp, 0); \
448
aria_load_state_8way(y0, y1, y2, y3, \
449
y4, y5, y6, y7, \
450
mem_tmp, 8); \
451
aria_diff_word(x0, x1, x2, x3, \
452
x4, x5, x6, x7, \
453
y0, y1, y2, y3, \
454
y4, y5, y6, y7); \
455
/* aria_diff_byte() \
456
* T3 = ABCD -> BADC \
457
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
458
* T0 = ABCD -> CDAB \
459
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
460
* T1 = ABCD -> DCBA \
461
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
462
*/ \
463
aria_diff_word(x2, x3, x0, x1, \
464
x7, x6, x5, x4, \
465
y0, y1, y2, y3, \
466
y5, y4, y7, y6); \
467
aria_store_state_8way(x3, x2, x1, x0, \
468
x6, x7, x4, x5, \
469
mem_tmp, 0);
470
471
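/*
 * aria_fo(): one odd ARIA round; same structure as aria_fe() but with the
 * substitution-layer inputs in the order used for odd rounds and a
 * different byte permutation at the end.
 */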
#define aria_fo(x0, x1, x2, x3, \
472
x4, x5, x6, x7, \
473
y0, y1, y2, y3, \
474
y4, y5, y6, y7, \
475
mem_tmp, rk, round) \
476
vpxor y7, y7, y7; \
477
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
478
y0, y7, y2, rk, 8, round); \
479
\
480
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
481
y0, y1, y2, y3, y4, y5, y6, y7); \
482
\
483
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
484
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
485
aria_store_state_8way(x0, x1, x2, x3, \
486
x4, x5, x6, x7, \
487
mem_tmp, 8); \
488
\
489
aria_load_state_8way(x0, x1, x2, x3, \
490
x4, x5, x6, x7, \
491
mem_tmp, 0); \
492
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
493
y0, y7, y2, rk, 0, round); \
494
\
495
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
496
y0, y1, y2, y3, y4, y5, y6, y7); \
497
\
498
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
499
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
500
aria_store_state_8way(x0, x1, x2, x3, \
501
x4, x5, x6, x7, \
502
mem_tmp, 0); \
503
aria_load_state_8way(y0, y1, y2, y3, \
504
y4, y5, y6, y7, \
505
mem_tmp, 8); \
506
aria_diff_word(x0, x1, x2, x3, \
507
x4, x5, x6, x7, \
508
y0, y1, y2, y3, \
509
y4, y5, y6, y7); \
510
/* aria_diff_byte() \
511
* T1 = ABCD -> BADC \
512
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
513
* T2 = ABCD -> CDAB \
514
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
515
* T3 = ABCD -> DCBA \
516
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
517
*/ \
518
aria_diff_word(x0, x1, x2, x3, \
519
x5, x4, x7, x6, \
520
y2, y3, y0, y1, \
521
y7, y6, y5, y4); \
522
aria_store_state_8way(x3, x2, x1, x0, \
523
x6, x7, x4, x5, \
524
mem_tmp, 0);
525
526
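/*
 * aria_ff(): final ARIA round: two AddRoundKey steps (round and
 * last_round) around the substitution layer, with no diffusion layer.
 */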
#define aria_ff(x0, x1, x2, x3, \
527
x4, x5, x6, x7, \
528
y0, y1, y2, y3, \
529
y4, y5, y6, y7, \
530
mem_tmp, rk, round, last_round) \
531
vpxor y7, y7, y7; \
532
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
533
y0, y7, y2, rk, 8, round); \
534
\
535
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
536
y0, y1, y2, y3, y4, y5, y6, y7); \
537
\
538
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
539
y0, y7, y2, rk, 8, last_round); \
540
\
541
aria_store_state_8way(x0, x1, x2, x3, \
542
x4, x5, x6, x7, \
543
mem_tmp, 8); \
544
\
545
aria_load_state_8way(x0, x1, x2, x3, \
546
x4, x5, x6, x7, \
547
mem_tmp, 0); \
548
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
549
y0, y7, y2, rk, 0, round); \
550
\
551
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
552
y0, y1, y2, y3, y4, y5, y6, y7); \
553
\
554
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
555
y0, y7, y2, rk, 0, last_round); \
556
\
557
aria_load_state_8way(y0, y1, y2, y3, \
558
y4, y5, y6, y7, \
559
mem_tmp, 8);
560
561
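/*
 * aria_fe_gfni()/aria_fo_gfni()/aria_ff_gfni(): GFNI variants of the round
 * macros above, identical except that the substitution layer uses
 * aria_sbox_8way_gfni().
 */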
#define aria_fe_gfni(x0, x1, x2, x3, \
562
x4, x5, x6, x7, \
563
y0, y1, y2, y3, \
564
y4, y5, y6, y7, \
565
mem_tmp, rk, round) \
566
vpxor y7, y7, y7; \
567
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
568
y0, y7, y2, rk, 8, round); \
569
\
570
aria_sbox_8way_gfni(x2, x3, x0, x1, \
571
x6, x7, x4, x5, \
572
y0, y1, y2, y3, \
573
y4, y5, y6, y7); \
574
\
575
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
576
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
577
aria_store_state_8way(x0, x1, x2, x3, \
578
x4, x5, x6, x7, \
579
mem_tmp, 8); \
580
\
581
aria_load_state_8way(x0, x1, x2, x3, \
582
x4, x5, x6, x7, \
583
mem_tmp, 0); \
584
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
585
y0, y7, y2, rk, 0, round); \
586
\
587
aria_sbox_8way_gfni(x2, x3, x0, x1, \
588
x6, x7, x4, x5, \
589
y0, y1, y2, y3, \
590
y4, y5, y6, y7); \
591
\
592
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
593
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
594
aria_store_state_8way(x0, x1, x2, x3, \
595
x4, x5, x6, x7, \
596
mem_tmp, 0); \
597
aria_load_state_8way(y0, y1, y2, y3, \
598
y4, y5, y6, y7, \
599
mem_tmp, 8); \
600
aria_diff_word(x0, x1, x2, x3, \
601
x4, x5, x6, x7, \
602
y0, y1, y2, y3, \
603
y4, y5, y6, y7); \
604
/* aria_diff_byte() \
605
* T3 = ABCD -> BADC \
606
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
607
* T0 = ABCD -> CDAB \
608
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
609
* T1 = ABCD -> DCBA \
610
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
611
*/ \
612
aria_diff_word(x2, x3, x0, x1, \
613
x7, x6, x5, x4, \
614
y0, y1, y2, y3, \
615
y5, y4, y7, y6); \
616
aria_store_state_8way(x3, x2, x1, x0, \
617
x6, x7, x4, x5, \
618
mem_tmp, 0);
619
620
#define aria_fo_gfni(x0, x1, x2, x3, \
621
x4, x5, x6, x7, \
622
y0, y1, y2, y3, \
623
y4, y5, y6, y7, \
624
mem_tmp, rk, round) \
625
vpxor y7, y7, y7; \
626
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
627
y0, y7, y2, rk, 8, round); \
628
\
629
aria_sbox_8way_gfni(x0, x1, x2, x3, \
630
x4, x5, x6, x7, \
631
y0, y1, y2, y3, \
632
y4, y5, y6, y7); \
633
\
634
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
635
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
636
aria_store_state_8way(x0, x1, x2, x3, \
637
x4, x5, x6, x7, \
638
mem_tmp, 8); \
639
\
640
aria_load_state_8way(x0, x1, x2, x3, \
641
x4, x5, x6, x7, \
642
mem_tmp, 0); \
643
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
644
y0, y7, y2, rk, 0, round); \
645
\
646
aria_sbox_8way_gfni(x0, x1, x2, x3, \
647
x4, x5, x6, x7, \
648
y0, y1, y2, y3, \
649
y4, y5, y6, y7); \
650
\
651
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
652
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
653
aria_store_state_8way(x0, x1, x2, x3, \
654
x4, x5, x6, x7, \
655
mem_tmp, 0); \
656
aria_load_state_8way(y0, y1, y2, y3, \
657
y4, y5, y6, y7, \
658
mem_tmp, 8); \
659
aria_diff_word(x0, x1, x2, x3, \
660
x4, x5, x6, x7, \
661
y0, y1, y2, y3, \
662
y4, y5, y6, y7); \
663
/* aria_diff_byte() \
664
* T1 = ABCD -> BADC \
665
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
666
* T2 = ABCD -> CDAB \
667
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
668
* T3 = ABCD -> DCBA \
669
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
670
*/ \
671
aria_diff_word(x0, x1, x2, x3, \
672
x5, x4, x7, x6, \
673
y2, y3, y0, y1, \
674
y7, y6, y5, y4); \
675
aria_store_state_8way(x3, x2, x1, x0, \
676
x6, x7, x4, x5, \
677
mem_tmp, 0);
678
679
#define aria_ff_gfni(x0, x1, x2, x3, \
680
x4, x5, x6, x7, \
681
y0, y1, y2, y3, \
682
y4, y5, y6, y7, \
683
mem_tmp, rk, round, last_round) \
684
vpxor y7, y7, y7; \
685
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
686
y0, y7, y2, rk, 8, round); \
687
\
688
aria_sbox_8way_gfni(x2, x3, x0, x1, \
689
x6, x7, x4, x5, \
690
y0, y1, y2, y3, \
691
y4, y5, y6, y7); \
692
\
693
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
694
y0, y7, y2, rk, 8, last_round); \
695
\
696
aria_store_state_8way(x0, x1, x2, x3, \
697
x4, x5, x6, x7, \
698
mem_tmp, 8); \
699
\
700
aria_load_state_8way(x0, x1, x2, x3, \
701
x4, x5, x6, x7, \
702
mem_tmp, 0); \
703
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
704
y0, y7, y2, rk, 0, round); \
705
\
706
aria_sbox_8way_gfni(x2, x3, x0, x1, \
707
x6, x7, x4, x5, \
708
y0, y1, y2, y3, \
709
y4, y5, y6, y7); \
710
\
711
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
712
y0, y7, y2, rk, 0, last_round); \
713
\
714
aria_load_state_8way(y0, y1, y2, y3, \
715
y4, y5, y6, y7, \
716
mem_tmp, 8);
717
718
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
719
.section .rodata.cst16, "aM", @progbits, 16
720
.align 16
721
722
#define SHUFB_BYTES(idx) \
723
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
724
725
.Lshufb_16x16b:
726
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
727
/* For isolating SubBytes from AESENCLAST, inverse shift row */
728
.Linv_shift_row:
729
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
730
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
731
.Lshift_row:
732
.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
733
.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
734
/* For CTR-mode IV byteswap */
735
.Lbswap128_mask:
736
.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
737
.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
738
739
/* AES inverse affine and S2 combined:
740
* 1 1 0 0 0 0 0 1 x0 0
741
* 0 1 0 0 1 0 0 0 x1 0
742
* 1 1 0 0 1 1 1 1 x2 0
743
* 0 1 1 0 1 0 0 1 x3 1
744
* 0 1 0 0 1 1 0 0 * x4 + 0
745
* 0 1 0 1 1 0 0 0 x5 0
746
* 0 0 0 0 0 1 0 1 x6 0
747
* 1 1 1 0 0 1 1 1 x7 1
748
*/
749
.Ltf_lo__inv_aff__and__s2:
750
.octa 0x92172DA81A9FA520B2370D883ABF8500
751
.Ltf_hi__inv_aff__and__s2:
752
.octa 0x2B15FFC1AF917B45E6D8320C625CB688
753
754
/* X2 and AES forward affine combined:
755
* 1 0 1 1 0 0 0 1 x0 0
756
* 0 1 1 1 1 0 1 1 x1 0
757
* 0 0 0 1 1 0 1 0 x2 1
758
* 0 1 0 0 0 1 0 0 x3 0
759
* 0 0 1 1 1 0 1 1 * x4 + 0
760
* 0 1 0 0 1 0 0 0 x5 0
761
* 1 1 0 1 0 0 1 1 x6 0
762
* 0 1 0 0 1 0 1 0 x7 0
763
*/
764
.Ltf_lo__x2__and__fwd_aff:
765
.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
766
.Ltf_hi__x2__and__fwd_aff:
767
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
768
769
/* AES affine: */
770
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
771
.Ltf_aff_bitmatrix:
772
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
773
BV8(1, 1, 0, 0, 0, 1, 1, 1),
774
BV8(1, 1, 1, 0, 0, 0, 1, 1),
775
BV8(1, 1, 1, 1, 0, 0, 0, 1),
776
BV8(1, 1, 1, 1, 1, 0, 0, 0),
777
BV8(0, 1, 1, 1, 1, 1, 0, 0),
778
BV8(0, 0, 1, 1, 1, 1, 1, 0),
779
BV8(0, 0, 0, 1, 1, 1, 1, 1))
780
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
781
BV8(1, 1, 0, 0, 0, 1, 1, 1),
782
BV8(1, 1, 1, 0, 0, 0, 1, 1),
783
BV8(1, 1, 1, 1, 0, 0, 0, 1),
784
BV8(1, 1, 1, 1, 1, 0, 0, 0),
785
BV8(0, 1, 1, 1, 1, 1, 0, 0),
786
BV8(0, 0, 1, 1, 1, 1, 1, 0),
787
BV8(0, 0, 0, 1, 1, 1, 1, 1))
788
789
/* AES inverse affine: */
790
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
791
.Ltf_inv_bitmatrix:
792
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
793
BV8(1, 0, 0, 1, 0, 0, 1, 0),
794
BV8(0, 1, 0, 0, 1, 0, 0, 1),
795
BV8(1, 0, 1, 0, 0, 1, 0, 0),
796
BV8(0, 1, 0, 1, 0, 0, 1, 0),
797
BV8(0, 0, 1, 0, 1, 0, 0, 1),
798
BV8(1, 0, 0, 1, 0, 1, 0, 0),
799
BV8(0, 1, 0, 0, 1, 0, 1, 0))
800
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
801
BV8(1, 0, 0, 1, 0, 0, 1, 0),
802
BV8(0, 1, 0, 0, 1, 0, 0, 1),
803
BV8(1, 0, 1, 0, 0, 1, 0, 0),
804
BV8(0, 1, 0, 1, 0, 0, 1, 0),
805
BV8(0, 0, 1, 0, 1, 0, 0, 1),
806
BV8(1, 0, 0, 1, 0, 1, 0, 0),
807
BV8(0, 1, 0, 0, 1, 0, 1, 0))
808
809
/* S2: */
810
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
811
.Ltf_s2_bitmatrix:
812
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
813
BV8(0, 0, 1, 1, 1, 1, 1, 1),
814
BV8(1, 1, 1, 0, 1, 1, 0, 1),
815
BV8(1, 1, 0, 0, 0, 0, 1, 1),
816
BV8(0, 1, 0, 0, 0, 0, 1, 1),
817
BV8(1, 1, 0, 0, 1, 1, 1, 0),
818
BV8(0, 1, 1, 0, 0, 0, 1, 1),
819
BV8(1, 1, 1, 1, 0, 1, 1, 0))
820
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
821
BV8(0, 0, 1, 1, 1, 1, 1, 1),
822
BV8(1, 1, 1, 0, 1, 1, 0, 1),
823
BV8(1, 1, 0, 0, 0, 0, 1, 1),
824
BV8(0, 1, 0, 0, 0, 0, 1, 1),
825
BV8(1, 1, 0, 0, 1, 1, 1, 0),
826
BV8(0, 1, 1, 0, 0, 0, 1, 1),
827
BV8(1, 1, 1, 1, 0, 1, 1, 0))
828
829
/* X2: */
830
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
831
.Ltf_x2_bitmatrix:
832
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
833
BV8(0, 0, 1, 0, 0, 1, 1, 0),
834
BV8(0, 0, 0, 0, 1, 0, 1, 0),
835
BV8(1, 1, 1, 0, 0, 0, 1, 1),
836
BV8(1, 1, 1, 0, 1, 1, 0, 0),
837
BV8(0, 1, 1, 0, 1, 0, 1, 1),
838
BV8(1, 0, 1, 1, 1, 1, 0, 1),
839
BV8(1, 0, 0, 1, 0, 0, 1, 1))
840
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
841
BV8(0, 0, 1, 0, 0, 1, 1, 0),
842
BV8(0, 0, 0, 0, 1, 0, 1, 0),
843
BV8(1, 1, 1, 0, 0, 0, 1, 1),
844
BV8(1, 1, 1, 0, 1, 1, 0, 0),
845
BV8(0, 1, 1, 0, 1, 0, 1, 1),
846
BV8(1, 0, 1, 1, 1, 1, 0, 1),
847
BV8(1, 0, 0, 1, 0, 0, 1, 1))
848
849
/* Identity matrix: */
850
.Ltf_id_bitmatrix:
851
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
852
BV8(0, 1, 0, 0, 0, 0, 0, 0),
853
BV8(0, 0, 1, 0, 0, 0, 0, 0),
854
BV8(0, 0, 0, 1, 0, 0, 0, 0),
855
BV8(0, 0, 0, 0, 1, 0, 0, 0),
856
BV8(0, 0, 0, 0, 0, 1, 0, 0),
857
BV8(0, 0, 0, 0, 0, 0, 1, 0),
858
BV8(0, 0, 0, 0, 0, 0, 0, 1))
859
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
860
BV8(0, 1, 0, 0, 0, 0, 0, 0),
861
BV8(0, 0, 1, 0, 0, 0, 0, 0),
862
BV8(0, 0, 0, 1, 0, 0, 0, 0),
863
BV8(0, 0, 0, 0, 1, 0, 0, 0),
864
BV8(0, 0, 0, 0, 0, 1, 0, 0),
865
BV8(0, 0, 0, 0, 0, 0, 1, 0),
866
BV8(0, 0, 0, 0, 0, 0, 0, 1))
867
868
/* 4-bit mask */
869
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
870
.align 4
871
.L0f0f0f0f:
872
.long 0x0f0f0f0f
873
874
.text
875
876
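/*
 * Core 16-way ARIA transform shared by the encrypt/decrypt/CTR entry
 * points: takes the 16 input blocks in %xmm0..%xmm15 and the round-key
 * pointer in %r9, byteslices the state into the dst buffer, runs 12, 14
 * or 16 rounds depending on ARIA_CTX_rounds, and de-byteslices the result.
 */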
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
877
/* input:
878
* %r9: rk
879
* %rsi: dst
880
* %rdx: src
881
* %xmm0..%xmm15: 16 byte-sliced blocks
882
*/
883
884
FRAME_BEGIN
885
886
movq %rsi, %rax;
887
leaq 8 * 16(%rax), %r8;
888
889
inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
890
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
891
%xmm15, %rax, %r8);
892
aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
893
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
894
%rax, %r9, 0);
895
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
896
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
897
%xmm15, %rax, %r9, 1);
898
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
899
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
900
%rax, %r9, 2);
901
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
902
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
903
%xmm15, %rax, %r9, 3);
904
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
905
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
906
%rax, %r9, 4);
907
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
908
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
909
%xmm15, %rax, %r9, 5);
910
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
911
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
912
%rax, %r9, 6);
913
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
914
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
915
%xmm15, %rax, %r9, 7);
916
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
917
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
918
%rax, %r9, 8);
919
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
920
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
921
%xmm15, %rax, %r9, 9);
922
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
923
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924
%rax, %r9, 10);
925
cmpl $12, ARIA_CTX_rounds(CTX);
926
jne .Laria_192;
927
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929
%xmm15, %rax, %r9, 11, 12);
930
jmp .Laria_end;
931
.Laria_192:
932
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
933
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
934
%xmm15, %rax, %r9, 11);
935
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
936
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
937
%rax, %r9, 12);
938
cmpl $14, ARIA_CTX_rounds(CTX);
939
jne .Laria_256;
940
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942
%xmm15, %rax, %r9, 13, 14);
943
jmp .Laria_end;
944
.Laria_256:
945
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
946
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
947
%xmm15, %rax, %r9, 13);
948
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
949
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
950
%rax, %r9, 14);
951
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
952
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
953
%xmm15, %rax, %r9, 15, 16);
954
.Laria_end:
955
debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
956
%xmm9, %xmm13, %xmm0, %xmm5,
957
%xmm10, %xmm14, %xmm3, %xmm6,
958
%xmm11, %xmm15, %xmm2, %xmm7,
959
(%rax), (%r8));
960
961
FRAME_END
962
RET;
963
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
964
965
SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
966
/* input:
967
* %rdi: ctx, CTX
968
* %rsi: dst
969
* %rdx: src
970
*/
971
972
FRAME_BEGIN
973
974
leaq ARIA_CTX_enc_key(CTX), %r9;
975
976
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
977
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
978
%xmm15, %rdx);
979
980
call __aria_aesni_avx_crypt_16way;
981
982
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
983
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
984
%xmm15, %rax);
985
986
FRAME_END
987
RET;
988
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
989
990
SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
991
/* input:
992
* %rdi: ctx, CTX
993
* %rsi: dst
994
* %rdx: src
995
*/
996
997
FRAME_BEGIN
998
999
leaq ARIA_CTX_dec_key(CTX), %r9;
1000
1001
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1002
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1003
%xmm15, %rdx);
1004
1005
call __aria_aesni_avx_crypt_16way;
1006
1007
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1008
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1009
%xmm15, %rax);
1010
1011
FRAME_END
1012
RET;
1013
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1014
1015
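/*
 * Generate 16 CTR counter blocks from the big-endian IV at (%r8): the
 * blocks are returned in %xmm0..%xmm15 (the first eight are also staged
 * through the keystream buffer at (%rcx)), and the IV is advanced by 16
 * and written back to (%r8).
 */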
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1016
/* input:
1017
* %rdi: ctx
1018
* %rsi: dst
1019
* %rdx: src
1020
* %rcx: keystream
1021
* %r8: iv (big endian, 128bit)
1022
*/
1023
1024
FRAME_BEGIN
1025
/* load IV and byteswap */
1026
vmovdqu (%r8), %xmm8;
1027
1028
vmovdqa .Lbswap128_mask(%rip), %xmm1;
1029
vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1030
1031
vpcmpeqd %xmm0, %xmm0, %xmm0;
1032
vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1033
1034
/* construct IVs */
1035
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1036
vpshufb %xmm1, %xmm3, %xmm9;
1037
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1038
vpshufb %xmm1, %xmm3, %xmm10;
1039
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1040
vpshufb %xmm1, %xmm3, %xmm11;
1041
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1042
vpshufb %xmm1, %xmm3, %xmm12;
1043
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044
vpshufb %xmm1, %xmm3, %xmm13;
1045
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046
vpshufb %xmm1, %xmm3, %xmm14;
1047
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048
vpshufb %xmm1, %xmm3, %xmm15;
1049
vmovdqu %xmm8, (0 * 16)(%rcx);
1050
vmovdqu %xmm9, (1 * 16)(%rcx);
1051
vmovdqu %xmm10, (2 * 16)(%rcx);
1052
vmovdqu %xmm11, (3 * 16)(%rcx);
1053
vmovdqu %xmm12, (4 * 16)(%rcx);
1054
vmovdqu %xmm13, (5 * 16)(%rcx);
1055
vmovdqu %xmm14, (6 * 16)(%rcx);
1056
vmovdqu %xmm15, (7 * 16)(%rcx);
1057
1058
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1059
vpshufb %xmm1, %xmm3, %xmm8;
1060
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1061
vpshufb %xmm1, %xmm3, %xmm9;
1062
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1063
vpshufb %xmm1, %xmm3, %xmm10;
1064
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1065
vpshufb %xmm1, %xmm3, %xmm11;
1066
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067
vpshufb %xmm1, %xmm3, %xmm12;
1068
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069
vpshufb %xmm1, %xmm3, %xmm13;
1070
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071
vpshufb %xmm1, %xmm3, %xmm14;
1072
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073
vpshufb %xmm1, %xmm3, %xmm15;
1074
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075
vpshufb %xmm1, %xmm3, %xmm4;
1076
vmovdqu %xmm4, (%r8);
1077
1078
vmovdqu (0 * 16)(%rcx), %xmm0;
1079
vmovdqu (1 * 16)(%rcx), %xmm1;
1080
vmovdqu (2 * 16)(%rcx), %xmm2;
1081
vmovdqu (3 * 16)(%rcx), %xmm3;
1082
vmovdqu (4 * 16)(%rcx), %xmm4;
1083
vmovdqu (5 * 16)(%rcx), %xmm5;
1084
vmovdqu (6 * 16)(%rcx), %xmm6;
1085
vmovdqu (7 * 16)(%rcx), %xmm7;
1086
1087
FRAME_END
1088
RET;
1089
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
1090
1091
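/*
 * CTR mode: generate 16 counter blocks, encrypt them with the ARIA
 * encryption key schedule, XOR the result with the 16 source blocks and
 * write the output to dst.
 */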
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1092
/* input:
1093
* %rdi: ctx
1094
* %rsi: dst
1095
* %rdx: src
1096
* %rcx: keystream
1097
* %r8: iv (big endian, 128bit)
1098
*/
1099
FRAME_BEGIN
1100
1101
call __aria_aesni_avx_ctr_gen_keystream_16way;
1102
1103
leaq (%rsi), %r10;
1104
leaq (%rdx), %r11;
1105
leaq (%rcx), %rsi;
1106
leaq (%rcx), %rdx;
1107
leaq ARIA_CTX_enc_key(CTX), %r9;
1108
1109
call __aria_aesni_avx_crypt_16way;
1110
1111
vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1112
vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1113
vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1114
vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1115
vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1116
vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1117
vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1118
vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1119
vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1120
vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1121
vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1122
vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1123
vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1124
vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1125
vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1126
vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1127
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1128
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1129
%xmm15, %r10);
1130
1131
FRAME_END
1132
RET;
1133
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1134
1135
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1136
/* input:
1137
* %r9: rk
1138
* %rsi: dst
1139
* %rdx: src
1140
* %xmm0..%xmm15: 16 byte-sliced blocks
1141
*/
1142
1143
FRAME_BEGIN
1144
1145
movq %rsi, %rax;
1146
leaq 8 * 16(%rax), %r8;
1147
1148
inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1149
%xmm4, %xmm5, %xmm6, %xmm7,
1150
%xmm8, %xmm9, %xmm10, %xmm11,
1151
%xmm12, %xmm13, %xmm14,
1152
%xmm15, %rax, %r8);
1153
aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1154
%xmm12, %xmm13, %xmm14, %xmm15,
1155
%xmm0, %xmm1, %xmm2, %xmm3,
1156
%xmm4, %xmm5, %xmm6, %xmm7,
1157
%rax, %r9, 0);
1158
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1159
%xmm4, %xmm5, %xmm6, %xmm7,
1160
%xmm8, %xmm9, %xmm10, %xmm11,
1161
%xmm12, %xmm13, %xmm14,
1162
%xmm15, %rax, %r9, 1);
1163
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1164
%xmm12, %xmm13, %xmm14, %xmm15,
1165
%xmm0, %xmm1, %xmm2, %xmm3,
1166
%xmm4, %xmm5, %xmm6, %xmm7,
1167
%rax, %r9, 2);
1168
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1169
%xmm4, %xmm5, %xmm6, %xmm7,
1170
%xmm8, %xmm9, %xmm10, %xmm11,
1171
%xmm12, %xmm13, %xmm14,
1172
%xmm15, %rax, %r9, 3);
1173
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1174
%xmm12, %xmm13, %xmm14, %xmm15,
1175
%xmm0, %xmm1, %xmm2, %xmm3,
1176
%xmm4, %xmm5, %xmm6, %xmm7,
1177
%rax, %r9, 4);
1178
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1179
%xmm4, %xmm5, %xmm6, %xmm7,
1180
%xmm8, %xmm9, %xmm10, %xmm11,
1181
%xmm12, %xmm13, %xmm14,
1182
%xmm15, %rax, %r9, 5);
1183
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1184
%xmm12, %xmm13, %xmm14, %xmm15,
1185
%xmm0, %xmm1, %xmm2, %xmm3,
1186
%xmm4, %xmm5, %xmm6, %xmm7,
1187
%rax, %r9, 6);
1188
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1189
%xmm4, %xmm5, %xmm6, %xmm7,
1190
%xmm8, %xmm9, %xmm10, %xmm11,
1191
%xmm12, %xmm13, %xmm14,
1192
%xmm15, %rax, %r9, 7);
1193
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1194
%xmm12, %xmm13, %xmm14, %xmm15,
1195
%xmm0, %xmm1, %xmm2, %xmm3,
1196
%xmm4, %xmm5, %xmm6, %xmm7,
1197
%rax, %r9, 8);
1198
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1199
%xmm4, %xmm5, %xmm6, %xmm7,
1200
%xmm8, %xmm9, %xmm10, %xmm11,
1201
%xmm12, %xmm13, %xmm14,
1202
%xmm15, %rax, %r9, 9);
1203
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1204
%xmm12, %xmm13, %xmm14, %xmm15,
1205
%xmm0, %xmm1, %xmm2, %xmm3,
1206
%xmm4, %xmm5, %xmm6, %xmm7,
1207
%rax, %r9, 10);
1208
cmpl $12, ARIA_CTX_rounds(CTX);
1209
jne .Laria_gfni_192;
1210
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1211
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1212
%xmm15, %rax, %r9, 11, 12);
1213
jmp .Laria_gfni_end;
1214
.Laria_gfni_192:
1215
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1216
%xmm4, %xmm5, %xmm6, %xmm7,
1217
%xmm8, %xmm9, %xmm10, %xmm11,
1218
%xmm12, %xmm13, %xmm14,
1219
%xmm15, %rax, %r9, 11);
1220
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1221
%xmm12, %xmm13, %xmm14, %xmm15,
1222
%xmm0, %xmm1, %xmm2, %xmm3,
1223
%xmm4, %xmm5, %xmm6, %xmm7,
1224
%rax, %r9, 12);
1225
cmpl $14, ARIA_CTX_rounds(CTX);
1226
jne .Laria_gfni_256;
1227
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1228
%xmm4, %xmm5, %xmm6, %xmm7,
1229
%xmm8, %xmm9, %xmm10, %xmm11,
1230
%xmm12, %xmm13, %xmm14,
1231
%xmm15, %rax, %r9, 13, 14);
1232
jmp .Laria_gfni_end;
1233
.Laria_gfni_256:
1234
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1235
%xmm4, %xmm5, %xmm6, %xmm7,
1236
%xmm8, %xmm9, %xmm10, %xmm11,
1237
%xmm12, %xmm13, %xmm14,
1238
%xmm15, %rax, %r9, 13);
1239
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1240
%xmm12, %xmm13, %xmm14, %xmm15,
1241
%xmm0, %xmm1, %xmm2, %xmm3,
1242
%xmm4, %xmm5, %xmm6, %xmm7,
1243
%rax, %r9, 14);
1244
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1245
%xmm4, %xmm5, %xmm6, %xmm7,
1246
%xmm8, %xmm9, %xmm10, %xmm11,
1247
%xmm12, %xmm13, %xmm14,
1248
%xmm15, %rax, %r9, 15, 16);
1249
.Laria_gfni_end:
1250
debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1251
%xmm9, %xmm13, %xmm0, %xmm5,
1252
%xmm10, %xmm14, %xmm3, %xmm6,
1253
%xmm11, %xmm15, %xmm2, %xmm7,
1254
(%rax), (%r8));
1255
1256
FRAME_END
1257
RET;
1258
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1259
1260
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1261
/* input:
1262
* %rdi: ctx, CTX
1263
* %rsi: dst
1264
* %rdx: src
1265
*/
1266
1267
FRAME_BEGIN
1268
1269
leaq ARIA_CTX_enc_key(CTX), %r9;
1270
1271
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1272
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1273
%xmm15, %rdx);
1274
1275
call __aria_aesni_avx_gfni_crypt_16way;
1276
1277
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1278
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1279
%xmm15, %rax);
1280
1281
FRAME_END
1282
RET;
1283
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1284
1285
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1286
/* input:
1287
* %rdi: ctx, CTX
1288
* %rsi: dst
1289
* %rdx: src
1290
*/
1291
1292
FRAME_BEGIN
1293
1294
leaq ARIA_CTX_dec_key(CTX), %r9;
1295
1296
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1297
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1298
%xmm15, %rdx);
1299
1300
call __aria_aesni_avx_gfni_crypt_16way;
1301
1302
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1303
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1304
%xmm15, %rax);
1305
1306
FRAME_END
1307
RET;
1308
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1309
1310
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1311
/* input:
1312
* %rdi: ctx
1313
* %rsi: dst
1314
* %rdx: src
1315
* %rcx: keystream
1316
* %r8: iv (big endian, 128bit)
1317
*/
1318
FRAME_BEGIN
1319
1320
call __aria_aesni_avx_ctr_gen_keystream_16way
1321
1322
leaq (%rsi), %r10;
1323
leaq (%rdx), %r11;
1324
leaq (%rcx), %rsi;
1325
leaq (%rcx), %rdx;
1326
leaq ARIA_CTX_enc_key(CTX), %r9;
1327
1328
call __aria_aesni_avx_gfni_crypt_16way;
1329
1330
vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1331
vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1332
vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1333
vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1334
vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1335
vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1336
vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1337
vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1338
vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1339
vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1340
vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1341
vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1342
vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1343
vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1344
vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1345
vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1346
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1347
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1348
%xmm15, %r10);
1349
1350
FRAME_END
1351
RET;
1352
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1353
1354