GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/crypto/aria-aesni-avx-asm_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi

18
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
19
( (((a0) & 1) << 0) | \
20
(((a1) & 1) << 1) | \
21
(((a2) & 1) << 2) | \
22
(((a3) & 1) << 3) | \
23
(((a4) & 1) << 4) | \
24
(((a5) & 1) << 5) | \
25
(((a6) & 1) << 6) | \
26
(((a7) & 1) << 7) )
27
28
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
29
( ((l7) << (0 * 8)) | \
30
((l6) << (1 * 8)) | \
31
((l5) << (2 * 8)) | \
32
((l4) << (3 * 8)) | \
33
((l3) << (4 * 8)) | \
34
((l2) << (5 * 8)) | \
35
((l1) << (6 * 8)) | \
36
((l0) << (7 * 8)) )
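/*
 * BV8() packs eight bit values into one byte, a0 becoming the least
 * significant bit.  BM8X8() packs eight such row bytes into a 64-bit
 * value describing an 8x8 bit-matrix, with row l0 placed in the most
 * significant byte.  Roughly, in C terms:
 *   BV8(a0, ..., a7)   == a0 | a1 << 1 | ... | a7 << 7
 *   BM8X8(l0, ..., l7) == l7 | l6 << 8 | ... | l0 << 56
 * The resulting quadwords are used below as operands for the GFNI
 * bit-matrix instructions (vgf2p8affineqb/vgf2p8affineinvqb).
 */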
37
38
#define inc_le128(x, minus_one, tmp) \
39
vpcmpeqq minus_one, x, tmp; \
40
vpsubq minus_one, x, x; \
41
vpslldq $8, tmp, tmp; \
42
vpsubq tmp, x, x;
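/*
 * inc_le128() adds 1 to a 128-bit little-endian value held in an XMM
 * register.  'minus_one' is expected to hold { -1, 0 } (all-ones in the
 * low qword only, as set up in the CTR code below): subtracting it adds
 * 1 to the low qword, and the vpcmpeqq/vpslldq pair propagates the carry
 * into the high qword when the low qword wraps around.
 */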
43
44
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
45
vpand x, mask4bit, tmp0; \
46
vpandn x, mask4bit, x; \
47
vpsrld $4, x, x; \
48
\
49
vpshufb tmp0, lo_t, tmp0; \
50
vpshufb x, hi_t, x; \
51
vpxor tmp0, x, x;
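/*
 * filter_8bit() applies a byte-wise transform defined by two 16-entry
 * lookup tables: 'lo_t' is indexed by the low nibble and 'hi_t' by the
 * high nibble of each input byte, and the two results are XORed.
 * Roughly: x[i] = lo_t[x[i] & 0xf] ^ hi_t[x[i] >> 4].  'mask4bit' must
 * hold 0x0f in every byte (.L0f0f0f0f below); 'tmp0' is clobbered.
 */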
52
53
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
54
vpunpckhdq x1, x0, t2; \
55
vpunpckldq x1, x0, x0; \
56
\
57
vpunpckldq x3, x2, t1; \
58
vpunpckhdq x3, x2, x2; \
59
\
60
vpunpckhqdq t1, x0, x1; \
61
vpunpcklqdq t1, x0, x0; \
62
\
63
vpunpckhqdq x2, t2, x3; \
64
vpunpcklqdq x2, t2, x2;
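/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words spread across
 * the four registers x0..x3 (one row per register) using the unpack
 * instructions; t1 and t2 are clobbered.
 */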
65
66
#define byteslice_16x16b(a0, b0, c0, d0, \
67
a1, b1, c1, d1, \
68
a2, b2, c2, d2, \
69
a3, b3, c3, d3, \
70
st0, st1) \
71
vmovdqu d2, st0; \
72
vmovdqu d3, st1; \
73
transpose_4x4(a0, a1, a2, a3, d2, d3); \
74
transpose_4x4(b0, b1, b2, b3, d2, d3); \
75
vmovdqu st0, d2; \
76
vmovdqu st1, d3; \
77
\
78
vmovdqu a0, st0; \
79
vmovdqu a1, st1; \
80
transpose_4x4(c0, c1, c2, c3, a0, a1); \
81
transpose_4x4(d0, d1, d2, d3, a0, a1); \
82
\
83
vmovdqu .Lshufb_16x16b(%rip), a0; \
84
vmovdqu st1, a1; \
85
vpshufb a0, a2, a2; \
86
vpshufb a0, a3, a3; \
87
vpshufb a0, b0, b0; \
88
vpshufb a0, b1, b1; \
89
vpshufb a0, b2, b2; \
90
vpshufb a0, b3, b3; \
91
vpshufb a0, a1, a1; \
92
vpshufb a0, c0, c0; \
93
vpshufb a0, c1, c1; \
94
vpshufb a0, c2, c2; \
95
vpshufb a0, c3, c3; \
96
vpshufb a0, d0, d0; \
97
vpshufb a0, d1, d1; \
98
vpshufb a0, d2, d2; \
99
vpshufb a0, d3, d3; \
100
vmovdqu d3, st1; \
101
vmovdqu st0, d3; \
102
vpshufb a0, d3, a0; \
103
vmovdqu d2, st0; \
104
\
105
transpose_4x4(a0, b0, c0, d0, d2, d3); \
106
transpose_4x4(a1, b1, c1, d1, d2, d3); \
107
vmovdqu st0, d2; \
108
vmovdqu st1, d3; \
109
\
110
vmovdqu b0, st0; \
111
vmovdqu b1, st1; \
112
transpose_4x4(a2, b2, c2, d2, b0, b1); \
113
transpose_4x4(a3, b3, c3, d3, b0, b1); \
114
vmovdqu st0, b0; \
115
vmovdqu st1, b1; \
116
/* does not adjust output bytes inside vectors */
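/*
 * byteslice_16x16b() rearranges sixteen 16-byte blocks held in sixteen
 * registers so that each output register gathers one byte position from
 * all sixteen blocks (the exact byte order inside each vector is not
 * normalized, as the comment above notes).  This byte-sliced layout lets
 * the substitution and diffusion macros below process sixteen blocks in
 * parallel.  st0/st1 are 16-byte scratch memory locations, and
 * debyteslice_16x16b() below performs the inverse rearrangement.
 */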
117
118
#define debyteslice_16x16b(a0, b0, c0, d0, \
119
a1, b1, c1, d1, \
120
a2, b2, c2, d2, \
121
a3, b3, c3, d3, \
122
st0, st1) \
123
vmovdqu d2, st0; \
124
vmovdqu d3, st1; \
125
transpose_4x4(a0, a1, a2, a3, d2, d3); \
126
transpose_4x4(b0, b1, b2, b3, d2, d3); \
127
vmovdqu st0, d2; \
128
vmovdqu st1, d3; \
129
\
130
vmovdqu a0, st0; \
131
vmovdqu a1, st1; \
132
transpose_4x4(c0, c1, c2, c3, a0, a1); \
133
transpose_4x4(d0, d1, d2, d3, a0, a1); \
134
\
135
vmovdqu .Lshufb_16x16b(%rip), a0; \
136
vmovdqu st1, a1; \
137
vpshufb a0, a2, a2; \
138
vpshufb a0, a3, a3; \
139
vpshufb a0, b0, b0; \
140
vpshufb a0, b1, b1; \
141
vpshufb a0, b2, b2; \
142
vpshufb a0, b3, b3; \
143
vpshufb a0, a1, a1; \
144
vpshufb a0, c0, c0; \
145
vpshufb a0, c1, c1; \
146
vpshufb a0, c2, c2; \
147
vpshufb a0, c3, c3; \
148
vpshufb a0, d0, d0; \
149
vpshufb a0, d1, d1; \
150
vpshufb a0, d2, d2; \
151
vpshufb a0, d3, d3; \
152
vmovdqu d3, st1; \
153
vmovdqu st0, d3; \
154
vpshufb a0, d3, a0; \
155
vmovdqu d2, st0; \
156
\
157
transpose_4x4(c0, d0, a0, b0, d2, d3); \
158
transpose_4x4(c1, d1, a1, b1, d2, d3); \
159
vmovdqu st0, d2; \
160
vmovdqu st1, d3; \
161
\
162
vmovdqu b0, st0; \
163
vmovdqu b1, st1; \
164
transpose_4x4(c2, d2, a2, b2, b0, b1); \
165
transpose_4x4(c3, d3, a3, b3, b0, b1); \
166
vmovdqu st0, b0; \
167
vmovdqu st1, b1; \
168
/* does not adjust output bytes inside vectors */
169
170
/* load blocks to registers and apply pre-whitening */
171
#define inpack16_pre(x0, x1, x2, x3, \
172
x4, x5, x6, x7, \
173
y0, y1, y2, y3, \
174
y4, y5, y6, y7, \
175
rio) \
176
vmovdqu (0 * 16)(rio), x0; \
177
vmovdqu (1 * 16)(rio), x1; \
178
vmovdqu (2 * 16)(rio), x2; \
179
vmovdqu (3 * 16)(rio), x3; \
180
vmovdqu (4 * 16)(rio), x4; \
181
vmovdqu (5 * 16)(rio), x5; \
182
vmovdqu (6 * 16)(rio), x6; \
183
vmovdqu (7 * 16)(rio), x7; \
184
vmovdqu (8 * 16)(rio), y0; \
185
vmovdqu (9 * 16)(rio), y1; \
186
vmovdqu (10 * 16)(rio), y2; \
187
vmovdqu (11 * 16)(rio), y3; \
188
vmovdqu (12 * 16)(rio), y4; \
189
vmovdqu (13 * 16)(rio), y5; \
190
vmovdqu (14 * 16)(rio), y6; \
191
vmovdqu (15 * 16)(rio), y7;
192
193
/* byteslice pre-whitened blocks and store to temporary memory */
194
#define inpack16_post(x0, x1, x2, x3, \
195
x4, x5, x6, x7, \
196
y0, y1, y2, y3, \
197
y4, y5, y6, y7, \
198
mem_ab, mem_cd) \
199
byteslice_16x16b(x0, x1, x2, x3, \
200
x4, x5, x6, x7, \
201
y0, y1, y2, y3, \
202
y4, y5, y6, y7, \
203
(mem_ab), (mem_cd)); \
204
\
205
vmovdqu x0, 0 * 16(mem_ab); \
206
vmovdqu x1, 1 * 16(mem_ab); \
207
vmovdqu x2, 2 * 16(mem_ab); \
208
vmovdqu x3, 3 * 16(mem_ab); \
209
vmovdqu x4, 4 * 16(mem_ab); \
210
vmovdqu x5, 5 * 16(mem_ab); \
211
vmovdqu x6, 6 * 16(mem_ab); \
212
vmovdqu x7, 7 * 16(mem_ab); \
213
vmovdqu y0, 0 * 16(mem_cd); \
214
vmovdqu y1, 1 * 16(mem_cd); \
215
vmovdqu y2, 2 * 16(mem_cd); \
216
vmovdqu y3, 3 * 16(mem_cd); \
217
vmovdqu y4, 4 * 16(mem_cd); \
218
vmovdqu y5, 5 * 16(mem_cd); \
219
vmovdqu y6, 6 * 16(mem_cd); \
220
vmovdqu y7, 7 * 16(mem_cd);
221
222
#define write_output(x0, x1, x2, x3, \
223
x4, x5, x6, x7, \
224
y0, y1, y2, y3, \
225
y4, y5, y6, y7, \
226
mem) \
227
vmovdqu x0, 0 * 16(mem); \
228
vmovdqu x1, 1 * 16(mem); \
229
vmovdqu x2, 2 * 16(mem); \
230
vmovdqu x3, 3 * 16(mem); \
231
vmovdqu x4, 4 * 16(mem); \
232
vmovdqu x5, 5 * 16(mem); \
233
vmovdqu x6, 6 * 16(mem); \
234
vmovdqu x7, 7 * 16(mem); \
235
vmovdqu y0, 8 * 16(mem); \
236
vmovdqu y1, 9 * 16(mem); \
237
vmovdqu y2, 10 * 16(mem); \
238
vmovdqu y3, 11 * 16(mem); \
239
vmovdqu y4, 12 * 16(mem); \
240
vmovdqu y5, 13 * 16(mem); \
241
vmovdqu y6, 14 * 16(mem); \
242
vmovdqu y7, 15 * 16(mem); \
243
244
#define aria_store_state_8way(x0, x1, x2, x3, \
245
x4, x5, x6, x7, \
246
mem_tmp, idx) \
247
vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
248
vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
249
vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
250
vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
251
vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
252
vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
253
vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
254
vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
256
#define aria_load_state_8way(x0, x1, x2, x3, \
257
x4, x5, x6, x7, \
258
mem_tmp, idx) \
259
vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
260
vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
261
vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
262
vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
263
vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
264
vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
265
vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
266
vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
268
#define aria_ark_8way(x0, x1, x2, x3, \
269
x4, x5, x6, x7, \
270
t0, t1, t2, rk, \
271
idx, round) \
272
/* AddRoundKey */ \
273
vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
274
vpsrld $24, t0, t2; \
275
vpshufb t1, t2, t2; \
276
vpxor t2, x0, x0; \
277
vpsrld $16, t0, t2; \
278
vpshufb t1, t2, t2; \
279
vpxor t2, x1, x1; \
280
vpsrld $8, t0, t2; \
281
vpshufb t1, t2, t2; \
282
vpxor t2, x2, x2; \
283
vpshufb t1, t0, t2; \
284
vpxor t2, x3, x3; \
285
vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
286
vpsrld $24, t0, t2; \
287
vpshufb t1, t2, t2; \
288
vpxor t2, x4, x4; \
289
vpsrld $16, t0, t2; \
290
vpshufb t1, t2, t2; \
291
vpxor t2, x5, x5; \
292
vpsrld $8, t0, t2; \
293
vpshufb t1, t2, t2; \
294
vpxor t2, x6, x6; \
295
vpshufb t1, t0, t2; \
296
vpxor t2, x7, x7;
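/*
 * aria_ark_8way() is AddRoundKey for the byte-sliced state: it loads a
 * 32-bit round-key word with vbroadcastss, isolates each key byte with a
 * shift, splats it across the register with vpshufb (t1 must be all-zero
 * so the shuffle broadcasts byte 0), and XORs it into the matching
 * byte-sliced register.  'rk' points to the round-key array;
 * '(round * 16) + idx' selects a 32-bit word of the 16-byte round key.
 */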
297
298
#ifdef CONFIG_AS_GFNI
299
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
300
x4, x5, x6, x7, \
301
t0, t1, t2, t3, \
302
t4, t5, t6, t7) \
303
vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
304
vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
305
vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
306
vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
307
vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
308
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
309
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
310
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
311
vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
312
vgf2p8affineinvqb $0, t2, x2, x2; \
313
vgf2p8affineinvqb $0, t2, x6, x6; \
314
vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
315
vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
316
vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
317
vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
318
vgf2p8affineinvqb $0, t2, x3, x3; \
319
vgf2p8affineinvqb $0, t2, x7, x7
320
321
#endif /* CONFIG_AS_GFNI */
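/*
 * aria_sbox_8way_gfni() computes ARIA's substitution layer with the GFNI
 * instructions: vgf2p8affineqb/vgf2p8affineinvqb apply the affine
 * bit-matrices defined further below (.Ltf_*_bitmatrix), and the "inv"
 * form adds the GF(2^8) inversion, yielding the S-boxes and their
 * inverses directly instead of the AES-NI and table tricks used by
 * aria_sbox_8way() below.
 */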
322
323
#define aria_sbox_8way(x0, x1, x2, x3, \
324
x4, x5, x6, x7, \
325
t0, t1, t2, t3, \
326
t4, t5, t6, t7) \
327
vmovdqa .Linv_shift_row(%rip), t0; \
328
vmovdqa .Lshift_row(%rip), t1; \
329
vbroadcastss .L0f0f0f0f(%rip), t6; \
330
vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
331
vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
332
vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
333
vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
334
\
335
vaesenclast t7, x0, x0; \
336
vaesenclast t7, x4, x4; \
337
vaesenclast t7, x1, x1; \
338
vaesenclast t7, x5, x5; \
339
vaesdeclast t7, x2, x2; \
340
vaesdeclast t7, x6, x6; \
341
\
342
/* AES inverse shift rows */ \
343
vpshufb t0, x0, x0; \
344
vpshufb t0, x4, x4; \
345
vpshufb t0, x1, x1; \
346
vpshufb t0, x5, x5; \
347
vpshufb t1, x3, x3; \
348
vpshufb t1, x7, x7; \
349
vpshufb t1, x2, x2; \
350
vpshufb t1, x6, x6; \
351
\
352
/* affine transformation for S2 */ \
353
filter_8bit(x1, t2, t3, t6, t0); \
354
/* affine transformation for S2 */ \
355
filter_8bit(x5, t2, t3, t6, t0); \
356
\
357
/* affine transformation for X2 */ \
358
filter_8bit(x3, t4, t5, t6, t0); \
359
/* affine transformation for X2 */ \
360
filter_8bit(x7, t4, t5, t6, t0); \
361
vaesdeclast t7, x3, x3; \
362
vaesdeclast t7, x7, x7;
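/*
 * aria_sbox_8way() computes the substitution layer with AES-NI: ARIA's
 * S1/S1^-1 are the AES S-box and its inverse, so vaesenclast /
 * vaesdeclast with an all-zero round key (callers zero t7 via the
 * vpxor y7, y7, y7 above) provide the byte substitution, and the
 * .Linv_shift_row / .Lshift_row shuffles undo the row shift those
 * instructions also perform.  The additional affine transforms needed
 * for the S2-related boxes are applied with filter_8bit() and the
 * .Ltf_*__and__* nibble tables.
 */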
363
364
#define aria_diff_m(x0, x1, x2, x3, \
365
t0, t1, t2, t3) \
366
/* T = rotr32(X, 8); */ \
367
/* X ^= T */ \
368
vpxor x0, x3, t0; \
369
vpxor x1, x0, t1; \
370
vpxor x2, x1, t2; \
371
vpxor x3, x2, t3; \
372
/* X = T ^ rotr(X, 16); */ \
373
vpxor t2, x0, x0; \
374
vpxor x1, t3, t3; \
375
vpxor t0, x2, x2; \
376
vpxor t1, x3, x1; \
377
vmovdqu t3, x3;
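/*
 * aria_diff_m() is the per-word part of ARIA's diffusion in byte-sliced
 * form: x0..x3 hold byte positions 0..3 of each 32-bit word, so the
 * rotate-and-XOR sequence in the comments above reduces to register
 * XORs, leaving each output byte position equal to the XOR of three of
 * the four input byte positions.  t0..t3 are clobbered.
 */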
378
379
#define aria_diff_word(x0, x1, x2, x3, \
380
x4, x5, x6, x7, \
381
y0, y1, y2, y3, \
382
y4, y5, y6, y7) \
383
/* t1 ^= t2; */ \
384
vpxor y0, x4, x4; \
385
vpxor y1, x5, x5; \
386
vpxor y2, x6, x6; \
387
vpxor y3, x7, x7; \
388
\
389
/* t2 ^= t3; */ \
390
vpxor y4, y0, y0; \
391
vpxor y5, y1, y1; \
392
vpxor y6, y2, y2; \
393
vpxor y7, y3, y3; \
394
\
395
/* t0 ^= t1; */ \
396
vpxor x4, x0, x0; \
397
vpxor x5, x1, x1; \
398
vpxor x6, x2, x2; \
399
vpxor x7, x3, x3; \
400
\
401
/* t3 ^= t1; */ \
402
vpxor x4, y4, y4; \
403
vpxor x5, y5, y5; \
404
vpxor x6, y6, y6; \
405
vpxor x7, y7, y7; \
406
\
407
/* t2 ^= t0; */ \
408
vpxor x0, y0, y0; \
409
vpxor x1, y1, y1; \
410
vpxor x2, y2, y2; \
411
vpxor x3, y3, y3; \
412
\
413
/* t1 ^= t2; */ \
414
vpxor y0, x4, x4; \
415
vpxor y1, x5, x5; \
416
vpxor y2, x6, x6; \
417
vpxor y3, x7, x7;
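/*
 * aria_diff_word() is the word-level mixing of ARIA's diffusion layer:
 * treating x0..x3, x4..x7, y0..y3 and y4..y7 as the four 32-bit words
 * T0..T3 (in byte-sliced form), it performs the XOR chain shown in the
 * comments (t1 ^= t2, t2 ^= t3, t0 ^= t1, ...) register by register.
 */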
418
419
#define aria_fe(x0, x1, x2, x3, \
420
x4, x5, x6, x7, \
421
y0, y1, y2, y3, \
422
y4, y5, y6, y7, \
423
mem_tmp, rk, round) \
424
vpxor y7, y7, y7; \
425
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
426
y0, y7, y2, rk, 8, round); \
427
\
428
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
429
y0, y1, y2, y3, y4, y5, y6, y7); \
430
\
431
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
432
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
433
aria_store_state_8way(x0, x1, x2, x3, \
434
x4, x5, x6, x7, \
435
mem_tmp, 8); \
436
\
437
aria_load_state_8way(x0, x1, x2, x3, \
438
x4, x5, x6, x7, \
439
mem_tmp, 0); \
440
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
441
y0, y7, y2, rk, 0, round); \
442
\
443
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
444
y0, y1, y2, y3, y4, y5, y6, y7); \
445
\
446
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
447
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
448
aria_store_state_8way(x0, x1, x2, x3, \
449
x4, x5, x6, x7, \
450
mem_tmp, 0); \
451
aria_load_state_8way(y0, y1, y2, y3, \
452
y4, y5, y6, y7, \
453
mem_tmp, 8); \
454
aria_diff_word(x0, x1, x2, x3, \
455
x4, x5, x6, x7, \
456
y0, y1, y2, y3, \
457
y4, y5, y6, y7); \
458
/* aria_diff_byte() \
459
* T3 = ABCD -> BADC \
460
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
461
* T0 = ABCD -> CDAB \
462
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
463
* T1 = ABCD -> DCBA \
464
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
465
*/ \
466
aria_diff_word(x2, x3, x0, x1, \
467
x7, x6, x5, x4, \
468
y0, y1, y2, y3, \
469
y5, y4, y7, y6); \
470
aria_store_state_8way(x3, x2, x1, x0, \
471
x6, x7, x4, x5, \
472
mem_tmp, 0);
473
474
#define aria_fo(x0, x1, x2, x3, \
475
x4, x5, x6, x7, \
476
y0, y1, y2, y3, \
477
y4, y5, y6, y7, \
478
mem_tmp, rk, round) \
479
vpxor y7, y7, y7; \
480
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
481
y0, y7, y2, rk, 8, round); \
482
\
483
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
484
y0, y1, y2, y3, y4, y5, y6, y7); \
485
\
486
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
487
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
488
aria_store_state_8way(x0, x1, x2, x3, \
489
x4, x5, x6, x7, \
490
mem_tmp, 8); \
491
\
492
aria_load_state_8way(x0, x1, x2, x3, \
493
x4, x5, x6, x7, \
494
mem_tmp, 0); \
495
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
496
y0, y7, y2, rk, 0, round); \
497
\
498
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
499
y0, y1, y2, y3, y4, y5, y6, y7); \
500
\
501
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
502
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
503
aria_store_state_8way(x0, x1, x2, x3, \
504
x4, x5, x6, x7, \
505
mem_tmp, 0); \
506
aria_load_state_8way(y0, y1, y2, y3, \
507
y4, y5, y6, y7, \
508
mem_tmp, 8); \
509
aria_diff_word(x0, x1, x2, x3, \
510
x4, x5, x6, x7, \
511
y0, y1, y2, y3, \
512
y4, y5, y6, y7); \
513
/* aria_diff_byte() \
514
* T1 = ABCD -> BADC \
515
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
516
* T2 = ABCD -> CDAB \
517
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
518
* T3 = ABCD -> DCBA \
519
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
520
*/ \
521
aria_diff_word(x0, x1, x2, x3, \
522
x5, x4, x7, x6, \
523
y2, y3, y0, y1, \
524
y7, y6, y5, y4); \
525
aria_store_state_8way(x3, x2, x1, x0, \
526
x6, x7, x4, x5, \
527
mem_tmp, 0);
528
529
#define aria_ff(x0, x1, x2, x3, \
530
x4, x5, x6, x7, \
531
y0, y1, y2, y3, \
532
y4, y5, y6, y7, \
533
mem_tmp, rk, round, last_round) \
534
vpxor y7, y7, y7; \
535
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
536
y0, y7, y2, rk, 8, round); \
537
\
538
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
539
y0, y1, y2, y3, y4, y5, y6, y7); \
540
\
541
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
542
y0, y7, y2, rk, 8, last_round); \
543
\
544
aria_store_state_8way(x0, x1, x2, x3, \
545
x4, x5, x6, x7, \
546
mem_tmp, 8); \
547
\
548
aria_load_state_8way(x0, x1, x2, x3, \
549
x4, x5, x6, x7, \
550
mem_tmp, 0); \
551
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
552
y0, y7, y2, rk, 0, round); \
553
\
554
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
555
y0, y1, y2, y3, y4, y5, y6, y7); \
556
\
557
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
558
y0, y7, y2, rk, 0, last_round); \
559
\
560
aria_load_state_8way(y0, y1, y2, y3, \
561
y4, y5, y6, y7, \
562
mem_tmp, 8);
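/*
 * aria_fo()/aria_fe() implement one odd/even ARIA round and aria_ff() the
 * final round: AddRoundKey (aria_ark_8way), the substitution layer
 * (invoked with a different register order in even rounds, matching
 * ARIA's alternating S-box layers), and the diffusion layer
 * (aria_diff_m/aria_diff_word); aria_ff() replaces the diffusion with a
 * second AddRoundKey using 'last_round'.  Only eight byte-sliced state
 * registers are live at a time, so each macro processes the two halves of
 * the 16-register state in turn, spilling to 'mem_tmp' via
 * aria_store_state_8way()/aria_load_state_8way().  The *_gfni variants
 * below differ only in calling aria_sbox_8way_gfni().
 */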
563
564
#ifdef CONFIG_AS_GFNI
565
#define aria_fe_gfni(x0, x1, x2, x3, \
566
x4, x5, x6, x7, \
567
y0, y1, y2, y3, \
568
y4, y5, y6, y7, \
569
mem_tmp, rk, round) \
570
vpxor y7, y7, y7; \
571
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
572
y0, y7, y2, rk, 8, round); \
573
\
574
aria_sbox_8way_gfni(x2, x3, x0, x1, \
575
x6, x7, x4, x5, \
576
y0, y1, y2, y3, \
577
y4, y5, y6, y7); \
578
\
579
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
580
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
581
aria_store_state_8way(x0, x1, x2, x3, \
582
x4, x5, x6, x7, \
583
mem_tmp, 8); \
584
\
585
aria_load_state_8way(x0, x1, x2, x3, \
586
x4, x5, x6, x7, \
587
mem_tmp, 0); \
588
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
589
y0, y7, y2, rk, 0, round); \
590
\
591
aria_sbox_8way_gfni(x2, x3, x0, x1, \
592
x6, x7, x4, x5, \
593
y0, y1, y2, y3, \
594
y4, y5, y6, y7); \
595
\
596
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
597
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
598
aria_store_state_8way(x0, x1, x2, x3, \
599
x4, x5, x6, x7, \
600
mem_tmp, 0); \
601
aria_load_state_8way(y0, y1, y2, y3, \
602
y4, y5, y6, y7, \
603
mem_tmp, 8); \
604
aria_diff_word(x0, x1, x2, x3, \
605
x4, x5, x6, x7, \
606
y0, y1, y2, y3, \
607
y4, y5, y6, y7); \
608
/* aria_diff_byte() \
609
* T3 = ABCD -> BADC \
610
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
611
* T0 = ABCD -> CDAB \
612
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
613
* T1 = ABCD -> DCBA \
614
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
615
*/ \
616
aria_diff_word(x2, x3, x0, x1, \
617
x7, x6, x5, x4, \
618
y0, y1, y2, y3, \
619
y5, y4, y7, y6); \
620
aria_store_state_8way(x3, x2, x1, x0, \
621
x6, x7, x4, x5, \
622
mem_tmp, 0);
623
624
#define aria_fo_gfni(x0, x1, x2, x3, \
625
x4, x5, x6, x7, \
626
y0, y1, y2, y3, \
627
y4, y5, y6, y7, \
628
mem_tmp, rk, round) \
629
vpxor y7, y7, y7; \
630
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
631
y0, y7, y2, rk, 8, round); \
632
\
633
aria_sbox_8way_gfni(x0, x1, x2, x3, \
634
x4, x5, x6, x7, \
635
y0, y1, y2, y3, \
636
y4, y5, y6, y7); \
637
\
638
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
639
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
640
aria_store_state_8way(x0, x1, x2, x3, \
641
x4, x5, x6, x7, \
642
mem_tmp, 8); \
643
\
644
aria_load_state_8way(x0, x1, x2, x3, \
645
x4, x5, x6, x7, \
646
mem_tmp, 0); \
647
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
648
y0, y7, y2, rk, 0, round); \
649
\
650
aria_sbox_8way_gfni(x0, x1, x2, x3, \
651
x4, x5, x6, x7, \
652
y0, y1, y2, y3, \
653
y4, y5, y6, y7); \
654
\
655
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
656
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
657
aria_store_state_8way(x0, x1, x2, x3, \
658
x4, x5, x6, x7, \
659
mem_tmp, 0); \
660
aria_load_state_8way(y0, y1, y2, y3, \
661
y4, y5, y6, y7, \
662
mem_tmp, 8); \
663
aria_diff_word(x0, x1, x2, x3, \
664
x4, x5, x6, x7, \
665
y0, y1, y2, y3, \
666
y4, y5, y6, y7); \
667
/* aria_diff_byte() \
668
* T1 = ABCD -> BADC \
669
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
670
* T2 = ABCD -> CDAB \
671
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
672
* T3 = ABCD -> DCBA \
673
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
674
*/ \
675
aria_diff_word(x0, x1, x2, x3, \
676
x5, x4, x7, x6, \
677
y2, y3, y0, y1, \
678
y7, y6, y5, y4); \
679
aria_store_state_8way(x3, x2, x1, x0, \
680
x6, x7, x4, x5, \
681
mem_tmp, 0);
682
683
#define aria_ff_gfni(x0, x1, x2, x3, \
684
x4, x5, x6, x7, \
685
y0, y1, y2, y3, \
686
y4, y5, y6, y7, \
687
mem_tmp, rk, round, last_round) \
688
vpxor y7, y7, y7; \
689
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
690
y0, y7, y2, rk, 8, round); \
691
\
692
aria_sbox_8way_gfni(x2, x3, x0, x1, \
693
x6, x7, x4, x5, \
694
y0, y1, y2, y3, \
695
y4, y5, y6, y7); \
696
\
697
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
698
y0, y7, y2, rk, 8, last_round); \
699
\
700
aria_store_state_8way(x0, x1, x2, x3, \
701
x4, x5, x6, x7, \
702
mem_tmp, 8); \
703
\
704
aria_load_state_8way(x0, x1, x2, x3, \
705
x4, x5, x6, x7, \
706
mem_tmp, 0); \
707
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
708
y0, y7, y2, rk, 0, round); \
709
\
710
aria_sbox_8way_gfni(x2, x3, x0, x1, \
711
x6, x7, x4, x5, \
712
y0, y1, y2, y3, \
713
y4, y5, y6, y7); \
714
\
715
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
716
y0, y7, y2, rk, 0, last_round); \
717
\
718
aria_load_state_8way(y0, y1, y2, y3, \
719
y4, y5, y6, y7, \
720
mem_tmp, 8);
721
722
#endif /* CONFIG_AS_GFNI */
723
724
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725
.section .rodata.cst16, "aM", @progbits, 16
726
.align 16
727
728
#define SHUFB_BYTES(idx) \
729
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
730
731
.Lshufb_16x16b:
732
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
733
/* For isolating SubBytes from AESENCLAST, inverse shift row */
734
.Linv_shift_row:
735
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
737
.Lshift_row:
738
.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739
.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740
/* For CTR-mode IV byteswap */
741
.Lbswap128_mask:
742
.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743
.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
744
745
/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
755
.Ltf_lo__inv_aff__and__s2:
756
.octa 0x92172DA81A9FA520B2370D883ABF8500
757
.Ltf_hi__inv_aff__and__s2:
758
.octa 0x2B15FFC1AF917B45E6D8320C625CB688
759
760
/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
770
.Ltf_lo__x2__and__fwd_aff:
771
.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772
.Ltf_hi__x2__and__fwd_aff:
773
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
774
775
#ifdef CONFIG_AS_GFNI
776
/* AES affine: */
777
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
778
.Ltf_aff_bitmatrix:
779
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
780
BV8(1, 1, 0, 0, 0, 1, 1, 1),
781
BV8(1, 1, 1, 0, 0, 0, 1, 1),
782
BV8(1, 1, 1, 1, 0, 0, 0, 1),
783
BV8(1, 1, 1, 1, 1, 0, 0, 0),
784
BV8(0, 1, 1, 1, 1, 1, 0, 0),
785
BV8(0, 0, 1, 1, 1, 1, 1, 0),
786
BV8(0, 0, 0, 1, 1, 1, 1, 1))
787
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
788
BV8(1, 1, 0, 0, 0, 1, 1, 1),
789
BV8(1, 1, 1, 0, 0, 0, 1, 1),
790
BV8(1, 1, 1, 1, 0, 0, 0, 1),
791
BV8(1, 1, 1, 1, 1, 0, 0, 0),
792
BV8(0, 1, 1, 1, 1, 1, 0, 0),
793
BV8(0, 0, 1, 1, 1, 1, 1, 0),
794
BV8(0, 0, 0, 1, 1, 1, 1, 1))
795
796
/* AES inverse affine: */
797
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
798
.Ltf_inv_bitmatrix:
799
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
800
BV8(1, 0, 0, 1, 0, 0, 1, 0),
801
BV8(0, 1, 0, 0, 1, 0, 0, 1),
802
BV8(1, 0, 1, 0, 0, 1, 0, 0),
803
BV8(0, 1, 0, 1, 0, 0, 1, 0),
804
BV8(0, 0, 1, 0, 1, 0, 0, 1),
805
BV8(1, 0, 0, 1, 0, 1, 0, 0),
806
BV8(0, 1, 0, 0, 1, 0, 1, 0))
807
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
808
BV8(1, 0, 0, 1, 0, 0, 1, 0),
809
BV8(0, 1, 0, 0, 1, 0, 0, 1),
810
BV8(1, 0, 1, 0, 0, 1, 0, 0),
811
BV8(0, 1, 0, 1, 0, 0, 1, 0),
812
BV8(0, 0, 1, 0, 1, 0, 0, 1),
813
BV8(1, 0, 0, 1, 0, 1, 0, 0),
814
BV8(0, 1, 0, 0, 1, 0, 1, 0))
815
816
/* S2: */
817
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
818
.Ltf_s2_bitmatrix:
819
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
820
BV8(0, 0, 1, 1, 1, 1, 1, 1),
821
BV8(1, 1, 1, 0, 1, 1, 0, 1),
822
BV8(1, 1, 0, 0, 0, 0, 1, 1),
823
BV8(0, 1, 0, 0, 0, 0, 1, 1),
824
BV8(1, 1, 0, 0, 1, 1, 1, 0),
825
BV8(0, 1, 1, 0, 0, 0, 1, 1),
826
BV8(1, 1, 1, 1, 0, 1, 1, 0))
827
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
828
BV8(0, 0, 1, 1, 1, 1, 1, 1),
829
BV8(1, 1, 1, 0, 1, 1, 0, 1),
830
BV8(1, 1, 0, 0, 0, 0, 1, 1),
831
BV8(0, 1, 0, 0, 0, 0, 1, 1),
832
BV8(1, 1, 0, 0, 1, 1, 1, 0),
833
BV8(0, 1, 1, 0, 0, 0, 1, 1),
834
BV8(1, 1, 1, 1, 0, 1, 1, 0))
835
836
/* X2: */
837
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
838
.Ltf_x2_bitmatrix:
839
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
840
BV8(0, 0, 1, 0, 0, 1, 1, 0),
841
BV8(0, 0, 0, 0, 1, 0, 1, 0),
842
BV8(1, 1, 1, 0, 0, 0, 1, 1),
843
BV8(1, 1, 1, 0, 1, 1, 0, 0),
844
BV8(0, 1, 1, 0, 1, 0, 1, 1),
845
BV8(1, 0, 1, 1, 1, 1, 0, 1),
846
BV8(1, 0, 0, 1, 0, 0, 1, 1))
847
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
848
BV8(0, 0, 1, 0, 0, 1, 1, 0),
849
BV8(0, 0, 0, 0, 1, 0, 1, 0),
850
BV8(1, 1, 1, 0, 0, 0, 1, 1),
851
BV8(1, 1, 1, 0, 1, 1, 0, 0),
852
BV8(0, 1, 1, 0, 1, 0, 1, 1),
853
BV8(1, 0, 1, 1, 1, 1, 0, 1),
854
BV8(1, 0, 0, 1, 0, 0, 1, 1))
855
856
/* Identity matrix: */
857
.Ltf_id_bitmatrix:
858
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
859
BV8(0, 1, 0, 0, 0, 0, 0, 0),
860
BV8(0, 0, 1, 0, 0, 0, 0, 0),
861
BV8(0, 0, 0, 1, 0, 0, 0, 0),
862
BV8(0, 0, 0, 0, 1, 0, 0, 0),
863
BV8(0, 0, 0, 0, 0, 1, 0, 0),
864
BV8(0, 0, 0, 0, 0, 0, 1, 0),
865
BV8(0, 0, 0, 0, 0, 0, 0, 1))
866
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
867
BV8(0, 1, 0, 0, 0, 0, 0, 0),
868
BV8(0, 0, 1, 0, 0, 0, 0, 0),
869
BV8(0, 0, 0, 1, 0, 0, 0, 0),
870
BV8(0, 0, 0, 0, 1, 0, 0, 0),
871
BV8(0, 0, 0, 0, 0, 1, 0, 0),
872
BV8(0, 0, 0, 0, 0, 0, 1, 0),
873
BV8(0, 0, 0, 0, 0, 0, 0, 1))
874
#endif /* CONFIG_AS_GFNI */
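/*
 * Each .Ltf_*_bitmatrix entry above is one 8x8 bit-matrix (built with
 * BM8X8/BV8) duplicated into both quadwords so it applies to every byte
 * of an XMM register; the matching tf_*_const values are the affine
 * constant terms passed as immediates to vgf2p8affineqb /
 * vgf2p8affineinvqb in aria_sbox_8way_gfni().
 */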
875
876
/* 4-bit mask */
877
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
878
.align 4
879
.L0f0f0f0f:
880
.long 0x0f0f0f0f
881
882
.text
883
884
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
885
/* input:
886
* %r9: rk
887
* %rsi: dst
888
* %rdx: src
889
* %xmm0..%xmm15: 16 byte-sliced blocks
890
*/
891
892
FRAME_BEGIN
893
894
movq %rsi, %rax;
895
leaq 8 * 16(%rax), %r8;
896
897
inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
898
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
899
%xmm15, %rax, %r8);
900
aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
901
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
902
%rax, %r9, 0);
903
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
904
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
905
%xmm15, %rax, %r9, 1);
906
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
907
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
908
%rax, %r9, 2);
909
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
910
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
911
%xmm15, %rax, %r9, 3);
912
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
913
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
914
%rax, %r9, 4);
915
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
916
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
917
%xmm15, %rax, %r9, 5);
918
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
919
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
920
%rax, %r9, 6);
921
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
922
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
923
%xmm15, %rax, %r9, 7);
924
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
925
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
926
%rax, %r9, 8);
927
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
928
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
929
%xmm15, %rax, %r9, 9);
930
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
931
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
932
%rax, %r9, 10);
933
cmpl $12, ARIA_CTX_rounds(CTX);
934
jne .Laria_192;
935
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
936
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
937
%xmm15, %rax, %r9, 11, 12);
938
jmp .Laria_end;
939
.Laria_192:
940
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
941
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
942
%xmm15, %rax, %r9, 11);
943
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
944
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
945
%rax, %r9, 12);
946
cmpl $14, ARIA_CTX_rounds(CTX);
947
jne .Laria_256;
948
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
949
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
950
%xmm15, %rax, %r9, 13, 14);
951
jmp .Laria_end;
952
.Laria_256:
953
aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
954
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
955
%xmm15, %rax, %r9, 13);
956
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
957
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
958
%rax, %r9, 14);
959
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
960
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
961
%xmm15, %rax, %r9, 15, 16);
962
.Laria_end:
963
debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
964
%xmm9, %xmm13, %xmm0, %xmm5,
965
%xmm10, %xmm14, %xmm3, %xmm6,
966
%xmm11, %xmm15, %xmm2, %xmm7,
967
(%rax), (%r8));
968
969
FRAME_END
970
RET;
971
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
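/*
 * __aria_aesni_avx_crypt_16way runs the ARIA round sequence over the 16
 * byte-sliced blocks in %xmm0..%xmm15, using the dst buffer (%rax, upper
 * half at %r8) as the temporary area for the spilled state halves.
 * ARIA_CTX_rounds (12, 14 or 16 for 128/192/256-bit keys) selects where
 * the final aria_ff() round falls; the state is then de-bytesliced and
 * left in registers for the caller to store with write_output().
 */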
972
973
SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
974
/* input:
975
* %rdi: ctx, CTX
976
* %rsi: dst
977
* %rdx: src
978
*/
979
980
FRAME_BEGIN
981
982
leaq ARIA_CTX_enc_key(CTX), %r9;
983
984
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
985
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
986
%xmm15, %rdx);
987
988
call __aria_aesni_avx_crypt_16way;
989
990
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
991
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
992
%xmm15, %rax);
993
994
FRAME_END
995
RET;
996
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
997
998
SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
999
/* input:
1000
* %rdi: ctx, CTX
1001
* %rsi: dst
1002
* %rdx: src
1003
*/
1004
1005
FRAME_BEGIN
1006
1007
leaq ARIA_CTX_dec_key(CTX), %r9;
1008
1009
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1010
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1011
%xmm15, %rdx);
1012
1013
call __aria_aesni_avx_crypt_16way;
1014
1015
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1016
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1017
%xmm15, %rax);
1018
1019
FRAME_END
1020
RET;
1021
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1022
1023
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1024
/* input:
1025
* %rdi: ctx
1026
* %rsi: dst
1027
* %rdx: src
1028
* %rcx: keystream
1029
* %r8: iv (big endian, 128bit)
1030
*/
1031
1032
FRAME_BEGIN
1033
/* load IV and byteswap */
1034
vmovdqu (%r8), %xmm8;
1035
1036
vmovdqa .Lbswap128_mask (%rip), %xmm1;
1037
vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1038
1039
vpcmpeqd %xmm0, %xmm0, %xmm0;
1040
vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1041
1042
/* construct IVs */
1043
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1044
vpshufb %xmm1, %xmm3, %xmm9;
1045
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046
vpshufb %xmm1, %xmm3, %xmm10;
1047
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048
vpshufb %xmm1, %xmm3, %xmm11;
1049
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050
vpshufb %xmm1, %xmm3, %xmm12;
1051
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052
vpshufb %xmm1, %xmm3, %xmm13;
1053
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054
vpshufb %xmm1, %xmm3, %xmm14;
1055
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056
vpshufb %xmm1, %xmm3, %xmm15;
1057
vmovdqu %xmm8, (0 * 16)(%rcx);
1058
vmovdqu %xmm9, (1 * 16)(%rcx);
1059
vmovdqu %xmm10, (2 * 16)(%rcx);
1060
vmovdqu %xmm11, (3 * 16)(%rcx);
1061
vmovdqu %xmm12, (4 * 16)(%rcx);
1062
vmovdqu %xmm13, (5 * 16)(%rcx);
1063
vmovdqu %xmm14, (6 * 16)(%rcx);
1064
vmovdqu %xmm15, (7 * 16)(%rcx);
1065
1066
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1067
vpshufb %xmm1, %xmm3, %xmm8;
1068
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1069
vpshufb %xmm1, %xmm3, %xmm9;
1070
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1071
vpshufb %xmm1, %xmm3, %xmm10;
1072
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1073
vpshufb %xmm1, %xmm3, %xmm11;
1074
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1075
vpshufb %xmm1, %xmm3, %xmm12;
1076
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1077
vpshufb %xmm1, %xmm3, %xmm13;
1078
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1079
vpshufb %xmm1, %xmm3, %xmm14;
1080
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1081
vpshufb %xmm1, %xmm3, %xmm15;
1082
inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1083
vpshufb %xmm1, %xmm3, %xmm4;
1084
vmovdqu %xmm4, (%r8);
1085
1086
vmovdqu (0 * 16)(%rcx), %xmm0;
1087
vmovdqu (1 * 16)(%rcx), %xmm1;
1088
vmovdqu (2 * 16)(%rcx), %xmm2;
1089
vmovdqu (3 * 16)(%rcx), %xmm3;
1090
vmovdqu (4 * 16)(%rcx), %xmm4;
1091
vmovdqu (5 * 16)(%rcx), %xmm5;
1092
vmovdqu (6 * 16)(%rcx), %xmm6;
1093
vmovdqu (7 * 16)(%rcx), %xmm7;
1094
1095
FRAME_END
1096
RET;
1097
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
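/*
 * __aria_aesni_avx_ctr_gen_keystream_16way expands the big-endian IV at
 * %r8 into 16 consecutive counter blocks: the IV is byteswapped with
 * .Lbswap128_mask, incremented as a 128-bit little-endian value with
 * inc_le128(), and swapped back for each block.  Blocks 0-7 are staged in
 * the keystream buffer at %rcx and reloaded into %xmm0-%xmm7, blocks 8-15
 * stay in %xmm8-%xmm15, and the IV advanced by 16 is written back to %r8.
 */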
1098
1099
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
1100
/* input:
1101
* %rdi: ctx
1102
* %rsi: dst
1103
* %rdx: src
1104
* %rcx: keystream
1105
* %r8: iv (big endian, 128bit)
1106
*/
1107
FRAME_BEGIN
1108
1109
call __aria_aesni_avx_ctr_gen_keystream_16way;
1110
1111
leaq (%rsi), %r10;
1112
leaq (%rdx), %r11;
1113
leaq (%rcx), %rsi;
1114
leaq (%rcx), %rdx;
1115
leaq ARIA_CTX_enc_key(CTX), %r9;
1116
1117
call __aria_aesni_avx_crypt_16way;
1118
1119
vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1120
vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1121
vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1122
vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1123
vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1124
vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1125
vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1126
vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1127
vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1128
vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1129
vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1130
vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1131
vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1132
vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1133
vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1134
vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1135
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1136
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1137
%xmm15, %r10);
1138
1139
FRAME_END
1140
RET;
1141
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
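/*
 * aria_aesni_avx_ctr_crypt_16way: the counter blocks generated above are
 * encrypted in place (dst/src are temporarily redirected to the keystream
 * buffer, with the real pointers saved in %r10/%r11), then the keystream
 * is XORed with the 16 source blocks and written to dst, so the same code
 * path serves both CTR encryption and decryption.
 */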
1142
1143
#ifdef CONFIG_AS_GFNI
1144
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
1145
/* input:
1146
* %r9: rk
1147
* %rsi: dst
1148
* %rdx: src
1149
* %xmm0..%xmm15: 16 byte-sliced blocks
1150
*/
1151
1152
FRAME_BEGIN
1153
1154
movq %rsi, %rax;
1155
leaq 8 * 16(%rax), %r8;
1156
1157
inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
1158
%xmm4, %xmm5, %xmm6, %xmm7,
1159
%xmm8, %xmm9, %xmm10, %xmm11,
1160
%xmm12, %xmm13, %xmm14,
1161
%xmm15, %rax, %r8);
1162
aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
1163
%xmm12, %xmm13, %xmm14, %xmm15,
1164
%xmm0, %xmm1, %xmm2, %xmm3,
1165
%xmm4, %xmm5, %xmm6, %xmm7,
1166
%rax, %r9, 0);
1167
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1168
%xmm4, %xmm5, %xmm6, %xmm7,
1169
%xmm8, %xmm9, %xmm10, %xmm11,
1170
%xmm12, %xmm13, %xmm14,
1171
%xmm15, %rax, %r9, 1);
1172
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1173
%xmm12, %xmm13, %xmm14, %xmm15,
1174
%xmm0, %xmm1, %xmm2, %xmm3,
1175
%xmm4, %xmm5, %xmm6, %xmm7,
1176
%rax, %r9, 2);
1177
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1178
%xmm4, %xmm5, %xmm6, %xmm7,
1179
%xmm8, %xmm9, %xmm10, %xmm11,
1180
%xmm12, %xmm13, %xmm14,
1181
%xmm15, %rax, %r9, 3);
1182
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1183
%xmm12, %xmm13, %xmm14, %xmm15,
1184
%xmm0, %xmm1, %xmm2, %xmm3,
1185
%xmm4, %xmm5, %xmm6, %xmm7,
1186
%rax, %r9, 4);
1187
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1188
%xmm4, %xmm5, %xmm6, %xmm7,
1189
%xmm8, %xmm9, %xmm10, %xmm11,
1190
%xmm12, %xmm13, %xmm14,
1191
%xmm15, %rax, %r9, 5);
1192
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1193
%xmm12, %xmm13, %xmm14, %xmm15,
1194
%xmm0, %xmm1, %xmm2, %xmm3,
1195
%xmm4, %xmm5, %xmm6, %xmm7,
1196
%rax, %r9, 6);
1197
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1198
%xmm4, %xmm5, %xmm6, %xmm7,
1199
%xmm8, %xmm9, %xmm10, %xmm11,
1200
%xmm12, %xmm13, %xmm14,
1201
%xmm15, %rax, %r9, 7);
1202
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1203
%xmm12, %xmm13, %xmm14, %xmm15,
1204
%xmm0, %xmm1, %xmm2, %xmm3,
1205
%xmm4, %xmm5, %xmm6, %xmm7,
1206
%rax, %r9, 8);
1207
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1208
%xmm4, %xmm5, %xmm6, %xmm7,
1209
%xmm8, %xmm9, %xmm10, %xmm11,
1210
%xmm12, %xmm13, %xmm14,
1211
%xmm15, %rax, %r9, 9);
1212
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1213
%xmm12, %xmm13, %xmm14, %xmm15,
1214
%xmm0, %xmm1, %xmm2, %xmm3,
1215
%xmm4, %xmm5, %xmm6, %xmm7,
1216
%rax, %r9, 10);
1217
cmpl $12, ARIA_CTX_rounds(CTX);
1218
jne .Laria_gfni_192;
1219
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1220
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1221
%xmm15, %rax, %r9, 11, 12);
1222
jmp .Laria_gfni_end;
1223
.Laria_gfni_192:
1224
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1225
%xmm4, %xmm5, %xmm6, %xmm7,
1226
%xmm8, %xmm9, %xmm10, %xmm11,
1227
%xmm12, %xmm13, %xmm14,
1228
%xmm15, %rax, %r9, 11);
1229
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1230
%xmm12, %xmm13, %xmm14, %xmm15,
1231
%xmm0, %xmm1, %xmm2, %xmm3,
1232
%xmm4, %xmm5, %xmm6, %xmm7,
1233
%rax, %r9, 12);
1234
cmpl $14, ARIA_CTX_rounds(CTX);
1235
jne .Laria_gfni_256;
1236
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1237
%xmm4, %xmm5, %xmm6, %xmm7,
1238
%xmm8, %xmm9, %xmm10, %xmm11,
1239
%xmm12, %xmm13, %xmm14,
1240
%xmm15, %rax, %r9, 13, 14);
1241
jmp .Laria_gfni_end;
1242
.Laria_gfni_256:
1243
aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1244
%xmm4, %xmm5, %xmm6, %xmm7,
1245
%xmm8, %xmm9, %xmm10, %xmm11,
1246
%xmm12, %xmm13, %xmm14,
1247
%xmm15, %rax, %r9, 13);
1248
aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
1249
%xmm12, %xmm13, %xmm14, %xmm15,
1250
%xmm0, %xmm1, %xmm2, %xmm3,
1251
%xmm4, %xmm5, %xmm6, %xmm7,
1252
%rax, %r9, 14);
1253
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
1254
%xmm4, %xmm5, %xmm6, %xmm7,
1255
%xmm8, %xmm9, %xmm10, %xmm11,
1256
%xmm12, %xmm13, %xmm14,
1257
%xmm15, %rax, %r9, 15, 16);
1258
.Laria_gfni_end:
1259
debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
1260
%xmm9, %xmm13, %xmm0, %xmm5,
1261
%xmm10, %xmm14, %xmm3, %xmm6,
1262
%xmm11, %xmm15, %xmm2, %xmm7,
1263
(%rax), (%r8));
1264
1265
FRAME_END
1266
RET;
1267
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1268
1269
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
1270
/* input:
1271
* %rdi: ctx, CTX
1272
* %rsi: dst
1273
* %rdx: src
1274
*/
1275
1276
FRAME_BEGIN
1277
1278
leaq ARIA_CTX_enc_key(CTX), %r9;
1279
1280
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1281
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1282
%xmm15, %rdx);
1283
1284
call __aria_aesni_avx_gfni_crypt_16way;
1285
1286
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1287
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1288
%xmm15, %rax);
1289
1290
FRAME_END
1291
RET;
1292
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1293
1294
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
1295
/* input:
1296
* %rdi: ctx, CTX
1297
* %rsi: dst
1298
* %rdx: src
1299
*/
1300
1301
FRAME_BEGIN
1302
1303
leaq ARIA_CTX_dec_key(CTX), %r9;
1304
1305
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1306
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1307
%xmm15, %rdx);
1308
1309
call __aria_aesni_avx_gfni_crypt_16way;
1310
1311
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1312
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1313
%xmm15, %rax);
1314
1315
FRAME_END
1316
RET;
1317
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1318
1319
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
1320
/* input:
1321
* %rdi: ctx
1322
* %rsi: dst
1323
* %rdx: src
1324
* %rcx: keystream
1325
* %r8: iv (big endian, 128bit)
1326
*/
1327
FRAME_BEGIN
1328
1329
call __aria_aesni_avx_ctr_gen_keystream_16way
1330
1331
leaq (%rsi), %r10;
1332
leaq (%rdx), %r11;
1333
leaq (%rcx), %rsi;
1334
leaq (%rcx), %rdx;
1335
leaq ARIA_CTX_enc_key(CTX), %r9;
1336
1337
call __aria_aesni_avx_gfni_crypt_16way;
1338
1339
vpxor (0 * 16)(%r11), %xmm1, %xmm1;
1340
vpxor (1 * 16)(%r11), %xmm0, %xmm0;
1341
vpxor (2 * 16)(%r11), %xmm3, %xmm3;
1342
vpxor (3 * 16)(%r11), %xmm2, %xmm2;
1343
vpxor (4 * 16)(%r11), %xmm4, %xmm4;
1344
vpxor (5 * 16)(%r11), %xmm5, %xmm5;
1345
vpxor (6 * 16)(%r11), %xmm6, %xmm6;
1346
vpxor (7 * 16)(%r11), %xmm7, %xmm7;
1347
vpxor (8 * 16)(%r11), %xmm8, %xmm8;
1348
vpxor (9 * 16)(%r11), %xmm9, %xmm9;
1349
vpxor (10 * 16)(%r11), %xmm10, %xmm10;
1350
vpxor (11 * 16)(%r11), %xmm11, %xmm11;
1351
vpxor (12 * 16)(%r11), %xmm12, %xmm12;
1352
vpxor (13 * 16)(%r11), %xmm13, %xmm13;
1353
vpxor (14 * 16)(%r11), %xmm14, %xmm14;
1354
vpxor (15 * 16)(%r11), %xmm15, %xmm15;
1355
write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1356
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1357
%xmm15, %r10);
1358
1359
FRAME_END
1360
RET;
1361
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362
#endif /* CONFIG_AS_GFNI */
1363
1364