Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/crypto/aria-aesni-avx2-asm_64.S
26451 views
1
/* SPDX-License-Identifier: GPL-2.0-or-later */
2
/*
3
* ARIA Cipher 32-way parallel algorithm (AVX2)
4
*
5
* Copyright (c) 2022 Taehee Yoo <[email protected]>
6
*
7
*/
8
9
#include <linux/linkage.h>
10
#include <asm/frame.h>
11
#include <asm/asm-offsets.h>
12
#include <linux/cfi_types.h>
13
14
/* register macros */
15
#define CTX %rdi
16
17
#define ymm0_x xmm0
18
#define ymm1_x xmm1
19
#define ymm2_x xmm2
20
#define ymm3_x xmm3
21
#define ymm4_x xmm4
22
#define ymm5_x xmm5
23
#define ymm6_x xmm6
24
#define ymm7_x xmm7
25
#define ymm8_x xmm8
26
#define ymm9_x xmm9
27
#define ymm10_x xmm10
28
#define ymm11_x xmm11
29
#define ymm12_x xmm12
30
#define ymm13_x xmm13
31
#define ymm14_x xmm14
32
#define ymm15_x xmm15
33
34
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
35
( (((a0) & 1) << 0) | \
36
(((a1) & 1) << 1) | \
37
(((a2) & 1) << 2) | \
38
(((a3) & 1) << 3) | \
39
(((a4) & 1) << 4) | \
40
(((a5) & 1) << 5) | \
41
(((a6) & 1) << 6) | \
42
(((a7) & 1) << 7) )
43
44
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
45
( ((l7) << (0 * 8)) | \
46
((l6) << (1 * 8)) | \
47
((l5) << (2 * 8)) | \
48
((l4) << (3 * 8)) | \
49
((l3) << (4 * 8)) | \
50
((l2) << (5 * 8)) | \
51
((l1) << (6 * 8)) | \
52
((l0) << (7 * 8)) )
53
54
#define inc_le128(x, minus_one, tmp) \
55
vpcmpeqq minus_one, x, tmp; \
56
vpsubq minus_one, x, x; \
57
vpslldq $8, tmp, tmp; \
58
vpsubq tmp, x, x;
59
60
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
61
vpand x, mask4bit, tmp0; \
62
vpandn x, mask4bit, x; \
63
vpsrld $4, x, x; \
64
\
65
vpshufb tmp0, lo_t, tmp0; \
66
vpshufb x, hi_t, x; \
67
vpxor tmp0, x, x;
68
69
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
70
vpunpckhdq x1, x0, t2; \
71
vpunpckldq x1, x0, x0; \
72
\
73
vpunpckldq x3, x2, t1; \
74
vpunpckhdq x3, x2, x2; \
75
\
76
vpunpckhqdq t1, x0, x1; \
77
vpunpcklqdq t1, x0, x0; \
78
\
79
vpunpckhqdq x2, t2, x3; \
80
vpunpcklqdq x2, t2, x2;
81
82
#define byteslice_16x16b(a0, b0, c0, d0, \
83
a1, b1, c1, d1, \
84
a2, b2, c2, d2, \
85
a3, b3, c3, d3, \
86
st0, st1) \
87
vmovdqu d2, st0; \
88
vmovdqu d3, st1; \
89
transpose_4x4(a0, a1, a2, a3, d2, d3); \
90
transpose_4x4(b0, b1, b2, b3, d2, d3); \
91
vmovdqu st0, d2; \
92
vmovdqu st1, d3; \
93
\
94
vmovdqu a0, st0; \
95
vmovdqu a1, st1; \
96
transpose_4x4(c0, c1, c2, c3, a0, a1); \
97
transpose_4x4(d0, d1, d2, d3, a0, a1); \
98
\
99
vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
100
vmovdqu st1, a1; \
101
vpshufb a0, a2, a2; \
102
vpshufb a0, a3, a3; \
103
vpshufb a0, b0, b0; \
104
vpshufb a0, b1, b1; \
105
vpshufb a0, b2, b2; \
106
vpshufb a0, b3, b3; \
107
vpshufb a0, a1, a1; \
108
vpshufb a0, c0, c0; \
109
vpshufb a0, c1, c1; \
110
vpshufb a0, c2, c2; \
111
vpshufb a0, c3, c3; \
112
vpshufb a0, d0, d0; \
113
vpshufb a0, d1, d1; \
114
vpshufb a0, d2, d2; \
115
vpshufb a0, d3, d3; \
116
vmovdqu d3, st1; \
117
vmovdqu st0, d3; \
118
vpshufb a0, d3, a0; \
119
vmovdqu d2, st0; \
120
\
121
transpose_4x4(a0, b0, c0, d0, d2, d3); \
122
transpose_4x4(a1, b1, c1, d1, d2, d3); \
123
vmovdqu st0, d2; \
124
vmovdqu st1, d3; \
125
\
126
vmovdqu b0, st0; \
127
vmovdqu b1, st1; \
128
transpose_4x4(a2, b2, c2, d2, b0, b1); \
129
transpose_4x4(a3, b3, c3, d3, b0, b1); \
130
vmovdqu st0, b0; \
131
vmovdqu st1, b1; \
132
/* does not adjust output bytes inside vectors */
133
134
#define debyteslice_16x16b(a0, b0, c0, d0, \
135
a1, b1, c1, d1, \
136
a2, b2, c2, d2, \
137
a3, b3, c3, d3, \
138
st0, st1) \
139
vmovdqu d2, st0; \
140
vmovdqu d3, st1; \
141
transpose_4x4(a0, a1, a2, a3, d2, d3); \
142
transpose_4x4(b0, b1, b2, b3, d2, d3); \
143
vmovdqu st0, d2; \
144
vmovdqu st1, d3; \
145
\
146
vmovdqu a0, st0; \
147
vmovdqu a1, st1; \
148
transpose_4x4(c0, c1, c2, c3, a0, a1); \
149
transpose_4x4(d0, d1, d2, d3, a0, a1); \
150
\
151
vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
152
vmovdqu st1, a1; \
153
vpshufb a0, a2, a2; \
154
vpshufb a0, a3, a3; \
155
vpshufb a0, b0, b0; \
156
vpshufb a0, b1, b1; \
157
vpshufb a0, b2, b2; \
158
vpshufb a0, b3, b3; \
159
vpshufb a0, a1, a1; \
160
vpshufb a0, c0, c0; \
161
vpshufb a0, c1, c1; \
162
vpshufb a0, c2, c2; \
163
vpshufb a0, c3, c3; \
164
vpshufb a0, d0, d0; \
165
vpshufb a0, d1, d1; \
166
vpshufb a0, d2, d2; \
167
vpshufb a0, d3, d3; \
168
vmovdqu d3, st1; \
169
vmovdqu st0, d3; \
170
vpshufb a0, d3, a0; \
171
vmovdqu d2, st0; \
172
\
173
transpose_4x4(c0, d0, a0, b0, d2, d3); \
174
transpose_4x4(c1, d1, a1, b1, d2, d3); \
175
vmovdqu st0, d2; \
176
vmovdqu st1, d3; \
177
\
178
vmovdqu b0, st0; \
179
vmovdqu b1, st1; \
180
transpose_4x4(c2, d2, a2, b2, b0, b1); \
181
transpose_4x4(c3, d3, a3, b3, b0, b1); \
182
vmovdqu st0, b0; \
183
vmovdqu st1, b1; \
184
/* does not adjust output bytes inside vectors */
185
186
/* load blocks to registers and apply pre-whitening */
187
#define inpack16_pre(x0, x1, x2, x3, \
188
x4, x5, x6, x7, \
189
y0, y1, y2, y3, \
190
y4, y5, y6, y7, \
191
rio) \
192
vmovdqu (0 * 32)(rio), x0; \
193
vmovdqu (1 * 32)(rio), x1; \
194
vmovdqu (2 * 32)(rio), x2; \
195
vmovdqu (3 * 32)(rio), x3; \
196
vmovdqu (4 * 32)(rio), x4; \
197
vmovdqu (5 * 32)(rio), x5; \
198
vmovdqu (6 * 32)(rio), x6; \
199
vmovdqu (7 * 32)(rio), x7; \
200
vmovdqu (8 * 32)(rio), y0; \
201
vmovdqu (9 * 32)(rio), y1; \
202
vmovdqu (10 * 32)(rio), y2; \
203
vmovdqu (11 * 32)(rio), y3; \
204
vmovdqu (12 * 32)(rio), y4; \
205
vmovdqu (13 * 32)(rio), y5; \
206
vmovdqu (14 * 32)(rio), y6; \
207
vmovdqu (15 * 32)(rio), y7;
208
209
/* byteslice pre-whitened blocks and store to temporary memory */
210
#define inpack16_post(x0, x1, x2, x3, \
211
x4, x5, x6, x7, \
212
y0, y1, y2, y3, \
213
y4, y5, y6, y7, \
214
mem_ab, mem_cd) \
215
byteslice_16x16b(x0, x1, x2, x3, \
216
x4, x5, x6, x7, \
217
y0, y1, y2, y3, \
218
y4, y5, y6, y7, \
219
(mem_ab), (mem_cd)); \
220
\
221
vmovdqu x0, 0 * 32(mem_ab); \
222
vmovdqu x1, 1 * 32(mem_ab); \
223
vmovdqu x2, 2 * 32(mem_ab); \
224
vmovdqu x3, 3 * 32(mem_ab); \
225
vmovdqu x4, 4 * 32(mem_ab); \
226
vmovdqu x5, 5 * 32(mem_ab); \
227
vmovdqu x6, 6 * 32(mem_ab); \
228
vmovdqu x7, 7 * 32(mem_ab); \
229
vmovdqu y0, 0 * 32(mem_cd); \
230
vmovdqu y1, 1 * 32(mem_cd); \
231
vmovdqu y2, 2 * 32(mem_cd); \
232
vmovdqu y3, 3 * 32(mem_cd); \
233
vmovdqu y4, 4 * 32(mem_cd); \
234
vmovdqu y5, 5 * 32(mem_cd); \
235
vmovdqu y6, 6 * 32(mem_cd); \
236
vmovdqu y7, 7 * 32(mem_cd);
237
238
#define write_output(x0, x1, x2, x3, \
239
x4, x5, x6, x7, \
240
y0, y1, y2, y3, \
241
y4, y5, y6, y7, \
242
mem) \
243
vmovdqu x0, 0 * 32(mem); \
244
vmovdqu x1, 1 * 32(mem); \
245
vmovdqu x2, 2 * 32(mem); \
246
vmovdqu x3, 3 * 32(mem); \
247
vmovdqu x4, 4 * 32(mem); \
248
vmovdqu x5, 5 * 32(mem); \
249
vmovdqu x6, 6 * 32(mem); \
250
vmovdqu x7, 7 * 32(mem); \
251
vmovdqu y0, 8 * 32(mem); \
252
vmovdqu y1, 9 * 32(mem); \
253
vmovdqu y2, 10 * 32(mem); \
254
vmovdqu y3, 11 * 32(mem); \
255
vmovdqu y4, 12 * 32(mem); \
256
vmovdqu y5, 13 * 32(mem); \
257
vmovdqu y6, 14 * 32(mem); \
258
vmovdqu y7, 15 * 32(mem); \
259
260
#define aria_store_state_8way(x0, x1, x2, x3, \
261
x4, x5, x6, x7, \
262
mem_tmp, idx) \
263
vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
264
vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
265
vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
266
vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
267
vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
268
vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
269
vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
270
vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
271
272
#define aria_load_state_8way(x0, x1, x2, x3, \
273
x4, x5, x6, x7, \
274
mem_tmp, idx) \
275
vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
276
vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
277
vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
278
vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
279
vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
280
vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
281
vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
282
vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
283
284
#define aria_ark_8way(x0, x1, x2, x3, \
285
x4, x5, x6, x7, \
286
t0, rk, idx, round) \
287
/* AddRoundKey */ \
288
vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
289
vpxor t0, x0, x0; \
290
vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
291
vpxor t0, x1, x1; \
292
vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
293
vpxor t0, x2, x2; \
294
vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
295
vpxor t0, x3, x3; \
296
vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
297
vpxor t0, x4, x4; \
298
vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
299
vpxor t0, x5, x5; \
300
vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
301
vpxor t0, x6, x6; \
302
vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
303
vpxor t0, x7, x7;
304
305
#ifdef CONFIG_AS_GFNI
306
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
307
x4, x5, x6, x7, \
308
t0, t1, t2, t3, \
309
t4, t5, t6, t7) \
310
vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
311
vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
312
vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
313
vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
314
vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
315
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
316
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
317
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
318
vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
319
vgf2p8affineinvqb $0, t2, x2, x2; \
320
vgf2p8affineinvqb $0, t2, x6, x6; \
321
vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
322
vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
323
vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
324
vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
325
vgf2p8affineinvqb $0, t2, x3, x3; \
326
vgf2p8affineinvqb $0, t2, x7, x7
327
328
#endif /* CONFIG_AS_GFNI */
329
#define aria_sbox_8way(x0, x1, x2, x3, \
330
x4, x5, x6, x7, \
331
t0, t1, t2, t3, \
332
t4, t5, t6, t7) \
333
vpxor t7, t7, t7; \
334
vpxor t6, t6, t6; \
335
vbroadcasti128 .Linv_shift_row(%rip), t0; \
336
vbroadcasti128 .Lshift_row(%rip), t1; \
337
vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
338
vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
339
vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
340
vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
341
\
342
vextracti128 $1, x0, t6##_x; \
343
vaesenclast t7##_x, x0##_x, x0##_x; \
344
vaesenclast t7##_x, t6##_x, t6##_x; \
345
vinserti128 $1, t6##_x, x0, x0; \
346
\
347
vextracti128 $1, x4, t6##_x; \
348
vaesenclast t7##_x, x4##_x, x4##_x; \
349
vaesenclast t7##_x, t6##_x, t6##_x; \
350
vinserti128 $1, t6##_x, x4, x4; \
351
\
352
vextracti128 $1, x1, t6##_x; \
353
vaesenclast t7##_x, x1##_x, x1##_x; \
354
vaesenclast t7##_x, t6##_x, t6##_x; \
355
vinserti128 $1, t6##_x, x1, x1; \
356
\
357
vextracti128 $1, x5, t6##_x; \
358
vaesenclast t7##_x, x5##_x, x5##_x; \
359
vaesenclast t7##_x, t6##_x, t6##_x; \
360
vinserti128 $1, t6##_x, x5, x5; \
361
\
362
vextracti128 $1, x2, t6##_x; \
363
vaesdeclast t7##_x, x2##_x, x2##_x; \
364
vaesdeclast t7##_x, t6##_x, t6##_x; \
365
vinserti128 $1, t6##_x, x2, x2; \
366
\
367
vextracti128 $1, x6, t6##_x; \
368
vaesdeclast t7##_x, x6##_x, x6##_x; \
369
vaesdeclast t7##_x, t6##_x, t6##_x; \
370
vinserti128 $1, t6##_x, x6, x6; \
371
\
372
vpbroadcastd .L0f0f0f0f(%rip), t6; \
373
\
374
/* AES inverse shift rows */ \
375
vpshufb t0, x0, x0; \
376
vpshufb t0, x4, x4; \
377
vpshufb t0, x1, x1; \
378
vpshufb t0, x5, x5; \
379
vpshufb t1, x3, x3; \
380
vpshufb t1, x7, x7; \
381
vpshufb t1, x2, x2; \
382
vpshufb t1, x6, x6; \
383
\
384
/* affine transformation for S2 */ \
385
filter_8bit(x1, t2, t3, t6, t0); \
386
/* affine transformation for S2 */ \
387
filter_8bit(x5, t2, t3, t6, t0); \
388
\
389
/* affine transformation for X2 */ \
390
filter_8bit(x3, t4, t5, t6, t0); \
391
/* affine transformation for X2 */ \
392
filter_8bit(x7, t4, t5, t6, t0); \
393
\
394
vpxor t6, t6, t6; \
395
vextracti128 $1, x3, t6##_x; \
396
vaesdeclast t7##_x, x3##_x, x3##_x; \
397
vaesdeclast t7##_x, t6##_x, t6##_x; \
398
vinserti128 $1, t6##_x, x3, x3; \
399
\
400
vextracti128 $1, x7, t6##_x; \
401
vaesdeclast t7##_x, x7##_x, x7##_x; \
402
vaesdeclast t7##_x, t6##_x, t6##_x; \
403
vinserti128 $1, t6##_x, x7, x7; \
404
405
#define aria_diff_m(x0, x1, x2, x3, \
406
t0, t1, t2, t3) \
407
/* T = rotr32(X, 8); */ \
408
/* X ^= T */ \
409
vpxor x0, x3, t0; \
410
vpxor x1, x0, t1; \
411
vpxor x2, x1, t2; \
412
vpxor x3, x2, t3; \
413
/* X = T ^ rotr(X, 16); */ \
414
vpxor t2, x0, x0; \
415
vpxor x1, t3, t3; \
416
vpxor t0, x2, x2; \
417
vpxor t1, x3, x1; \
418
vmovdqu t3, x3;
419
420
#define aria_diff_word(x0, x1, x2, x3, \
421
x4, x5, x6, x7, \
422
y0, y1, y2, y3, \
423
y4, y5, y6, y7) \
424
/* t1 ^= t2; */ \
425
vpxor y0, x4, x4; \
426
vpxor y1, x5, x5; \
427
vpxor y2, x6, x6; \
428
vpxor y3, x7, x7; \
429
\
430
/* t2 ^= t3; */ \
431
vpxor y4, y0, y0; \
432
vpxor y5, y1, y1; \
433
vpxor y6, y2, y2; \
434
vpxor y7, y3, y3; \
435
\
436
/* t0 ^= t1; */ \
437
vpxor x4, x0, x0; \
438
vpxor x5, x1, x1; \
439
vpxor x6, x2, x2; \
440
vpxor x7, x3, x3; \
441
\
442
/* t3 ^= t1; */ \
443
vpxor x4, y4, y4; \
444
vpxor x5, y5, y5; \
445
vpxor x6, y6, y6; \
446
vpxor x7, y7, y7; \
447
\
448
/* t2 ^= t0; */ \
449
vpxor x0, y0, y0; \
450
vpxor x1, y1, y1; \
451
vpxor x2, y2, y2; \
452
vpxor x3, y3, y3; \
453
\
454
/* t1 ^= t2; */ \
455
vpxor y0, x4, x4; \
456
vpxor y1, x5, x5; \
457
vpxor y2, x6, x6; \
458
vpxor y3, x7, x7;
459
460
#define aria_fe(x0, x1, x2, x3, \
461
x4, x5, x6, x7, \
462
y0, y1, y2, y3, \
463
y4, y5, y6, y7, \
464
mem_tmp, rk, round) \
465
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
466
y0, rk, 8, round); \
467
\
468
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
469
y0, y1, y2, y3, y4, y5, y6, y7); \
470
\
471
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
472
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
473
aria_store_state_8way(x0, x1, x2, x3, \
474
x4, x5, x6, x7, \
475
mem_tmp, 8); \
476
\
477
aria_load_state_8way(x0, x1, x2, x3, \
478
x4, x5, x6, x7, \
479
mem_tmp, 0); \
480
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
481
y0, rk, 0, round); \
482
\
483
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
484
y0, y1, y2, y3, y4, y5, y6, y7); \
485
\
486
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
487
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
488
aria_store_state_8way(x0, x1, x2, x3, \
489
x4, x5, x6, x7, \
490
mem_tmp, 0); \
491
aria_load_state_8way(y0, y1, y2, y3, \
492
y4, y5, y6, y7, \
493
mem_tmp, 8); \
494
aria_diff_word(x0, x1, x2, x3, \
495
x4, x5, x6, x7, \
496
y0, y1, y2, y3, \
497
y4, y5, y6, y7); \
498
/* aria_diff_byte() \
499
* T3 = ABCD -> BADC \
500
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
501
* T0 = ABCD -> CDAB \
502
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
503
* T1 = ABCD -> DCBA \
504
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
505
*/ \
506
aria_diff_word(x2, x3, x0, x1, \
507
x7, x6, x5, x4, \
508
y0, y1, y2, y3, \
509
y5, y4, y7, y6); \
510
aria_store_state_8way(x3, x2, x1, x0, \
511
x6, x7, x4, x5, \
512
mem_tmp, 0);
513
514
#define aria_fo(x0, x1, x2, x3, \
515
x4, x5, x6, x7, \
516
y0, y1, y2, y3, \
517
y4, y5, y6, y7, \
518
mem_tmp, rk, round) \
519
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
520
y0, rk, 8, round); \
521
\
522
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
523
y0, y1, y2, y3, y4, y5, y6, y7); \
524
\
525
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
526
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
527
aria_store_state_8way(x0, x1, x2, x3, \
528
x4, x5, x6, x7, \
529
mem_tmp, 8); \
530
\
531
aria_load_state_8way(x0, x1, x2, x3, \
532
x4, x5, x6, x7, \
533
mem_tmp, 0); \
534
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
535
y0, rk, 0, round); \
536
\
537
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
538
y0, y1, y2, y3, y4, y5, y6, y7); \
539
\
540
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
541
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
542
aria_store_state_8way(x0, x1, x2, x3, \
543
x4, x5, x6, x7, \
544
mem_tmp, 0); \
545
aria_load_state_8way(y0, y1, y2, y3, \
546
y4, y5, y6, y7, \
547
mem_tmp, 8); \
548
aria_diff_word(x0, x1, x2, x3, \
549
x4, x5, x6, x7, \
550
y0, y1, y2, y3, \
551
y4, y5, y6, y7); \
552
/* aria_diff_byte() \
553
* T1 = ABCD -> BADC \
554
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
555
* T2 = ABCD -> CDAB \
556
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
557
* T3 = ABCD -> DCBA \
558
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
559
*/ \
560
aria_diff_word(x0, x1, x2, x3, \
561
x5, x4, x7, x6, \
562
y2, y3, y0, y1, \
563
y7, y6, y5, y4); \
564
aria_store_state_8way(x3, x2, x1, x0, \
565
x6, x7, x4, x5, \
566
mem_tmp, 0);
567
568
#define aria_ff(x0, x1, x2, x3, \
569
x4, x5, x6, x7, \
570
y0, y1, y2, y3, \
571
y4, y5, y6, y7, \
572
mem_tmp, rk, round, last_round) \
573
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
574
y0, rk, 8, round); \
575
\
576
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
577
y0, y1, y2, y3, y4, y5, y6, y7); \
578
\
579
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
580
y0, rk, 8, last_round); \
581
\
582
aria_store_state_8way(x0, x1, x2, x3, \
583
x4, x5, x6, x7, \
584
mem_tmp, 8); \
585
\
586
aria_load_state_8way(x0, x1, x2, x3, \
587
x4, x5, x6, x7, \
588
mem_tmp, 0); \
589
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
590
y0, rk, 0, round); \
591
\
592
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
593
y0, y1, y2, y3, y4, y5, y6, y7); \
594
\
595
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
596
y0, rk, 0, last_round); \
597
\
598
aria_load_state_8way(y0, y1, y2, y3, \
599
y4, y5, y6, y7, \
600
mem_tmp, 8);
601
#ifdef CONFIG_AS_GFNI
602
#define aria_fe_gfni(x0, x1, x2, x3, \
603
x4, x5, x6, x7, \
604
y0, y1, y2, y3, \
605
y4, y5, y6, y7, \
606
mem_tmp, rk, round) \
607
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
608
y0, rk, 8, round); \
609
\
610
aria_sbox_8way_gfni(x2, x3, x0, x1, \
611
x6, x7, x4, x5, \
612
y0, y1, y2, y3, \
613
y4, y5, y6, y7); \
614
\
615
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
616
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
617
aria_store_state_8way(x0, x1, x2, x3, \
618
x4, x5, x6, x7, \
619
mem_tmp, 8); \
620
\
621
aria_load_state_8way(x0, x1, x2, x3, \
622
x4, x5, x6, x7, \
623
mem_tmp, 0); \
624
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
625
y0, rk, 0, round); \
626
\
627
aria_sbox_8way_gfni(x2, x3, x0, x1, \
628
x6, x7, x4, x5, \
629
y0, y1, y2, y3, \
630
y4, y5, y6, y7); \
631
\
632
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
633
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
634
aria_store_state_8way(x0, x1, x2, x3, \
635
x4, x5, x6, x7, \
636
mem_tmp, 0); \
637
aria_load_state_8way(y0, y1, y2, y3, \
638
y4, y5, y6, y7, \
639
mem_tmp, 8); \
640
aria_diff_word(x0, x1, x2, x3, \
641
x4, x5, x6, x7, \
642
y0, y1, y2, y3, \
643
y4, y5, y6, y7); \
644
/* aria_diff_byte() \
645
* T3 = ABCD -> BADC \
646
* T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
647
* T0 = ABCD -> CDAB \
648
* T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
649
* T1 = ABCD -> DCBA \
650
* T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
651
*/ \
652
aria_diff_word(x2, x3, x0, x1, \
653
x7, x6, x5, x4, \
654
y0, y1, y2, y3, \
655
y5, y4, y7, y6); \
656
aria_store_state_8way(x3, x2, x1, x0, \
657
x6, x7, x4, x5, \
658
mem_tmp, 0);
659
660
#define aria_fo_gfni(x0, x1, x2, x3, \
661
x4, x5, x6, x7, \
662
y0, y1, y2, y3, \
663
y4, y5, y6, y7, \
664
mem_tmp, rk, round) \
665
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
666
y0, rk, 8, round); \
667
\
668
aria_sbox_8way_gfni(x0, x1, x2, x3, \
669
x4, x5, x6, x7, \
670
y0, y1, y2, y3, \
671
y4, y5, y6, y7); \
672
\
673
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
674
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
675
aria_store_state_8way(x0, x1, x2, x3, \
676
x4, x5, x6, x7, \
677
mem_tmp, 8); \
678
\
679
aria_load_state_8way(x0, x1, x2, x3, \
680
x4, x5, x6, x7, \
681
mem_tmp, 0); \
682
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
683
y0, rk, 0, round); \
684
\
685
aria_sbox_8way_gfni(x0, x1, x2, x3, \
686
x4, x5, x6, x7, \
687
y0, y1, y2, y3, \
688
y4, y5, y6, y7); \
689
\
690
aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
691
aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
692
aria_store_state_8way(x0, x1, x2, x3, \
693
x4, x5, x6, x7, \
694
mem_tmp, 0); \
695
aria_load_state_8way(y0, y1, y2, y3, \
696
y4, y5, y6, y7, \
697
mem_tmp, 8); \
698
aria_diff_word(x0, x1, x2, x3, \
699
x4, x5, x6, x7, \
700
y0, y1, y2, y3, \
701
y4, y5, y6, y7); \
702
/* aria_diff_byte() \
703
* T1 = ABCD -> BADC \
704
* T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
705
* T2 = ABCD -> CDAB \
706
* T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
707
* T3 = ABCD -> DCBA \
708
* T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
709
*/ \
710
aria_diff_word(x0, x1, x2, x3, \
711
x5, x4, x7, x6, \
712
y2, y3, y0, y1, \
713
y7, y6, y5, y4); \
714
aria_store_state_8way(x3, x2, x1, x0, \
715
x6, x7, x4, x5, \
716
mem_tmp, 0);
717
718
#define aria_ff_gfni(x0, x1, x2, x3, \
719
x4, x5, x6, x7, \
720
y0, y1, y2, y3, \
721
y4, y5, y6, y7, \
722
mem_tmp, rk, round, last_round) \
723
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
724
y0, rk, 8, round); \
725
\
726
aria_sbox_8way_gfni(x2, x3, x0, x1, \
727
x6, x7, x4, x5, \
728
y0, y1, y2, y3, \
729
y4, y5, y6, y7); \
730
\
731
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
732
y0, rk, 8, last_round); \
733
\
734
aria_store_state_8way(x0, x1, x2, x3, \
735
x4, x5, x6, x7, \
736
mem_tmp, 8); \
737
\
738
aria_load_state_8way(x0, x1, x2, x3, \
739
x4, x5, x6, x7, \
740
mem_tmp, 0); \
741
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
742
y0, rk, 0, round); \
743
\
744
aria_sbox_8way_gfni(x2, x3, x0, x1, \
745
x6, x7, x4, x5, \
746
y0, y1, y2, y3, \
747
y4, y5, y6, y7); \
748
\
749
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
750
y0, rk, 0, last_round); \
751
\
752
aria_load_state_8way(y0, y1, y2, y3, \
753
y4, y5, y6, y7, \
754
mem_tmp, 8);
755
#endif /* CONFIG_AS_GFNI */
756
757
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
758
.align 32
759
#define SHUFB_BYTES(idx) \
760
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
761
.Lshufb_16x16b:
762
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
763
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
764
765
.section .rodata.cst16, "aM", @progbits, 16
766
.align 16
767
/* For isolating SubBytes from AESENCLAST, inverse shift row */
768
.Linv_shift_row:
769
.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
770
.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
771
.Lshift_row:
772
.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
773
.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
774
/* For CTR-mode IV byteswap */
775
.Lbswap128_mask:
776
.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
777
.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
778
779
/* AES inverse affine and S2 combined:
780
* 1 1 0 0 0 0 0 1 x0 0
781
* 0 1 0 0 1 0 0 0 x1 0
782
* 1 1 0 0 1 1 1 1 x2 0
783
* 0 1 1 0 1 0 0 1 x3 1
784
* 0 1 0 0 1 1 0 0 * x4 + 0
785
* 0 1 0 1 1 0 0 0 x5 0
786
* 0 0 0 0 0 1 0 1 x6 0
787
* 1 1 1 0 0 1 1 1 x7 1
788
*/
789
.Ltf_lo__inv_aff__and__s2:
790
.octa 0x92172DA81A9FA520B2370D883ABF8500
791
.Ltf_hi__inv_aff__and__s2:
792
.octa 0x2B15FFC1AF917B45E6D8320C625CB688
793
794
/* X2 and AES forward affine combined:
795
* 1 0 1 1 0 0 0 1 x0 0
796
* 0 1 1 1 1 0 1 1 x1 0
797
* 0 0 0 1 1 0 1 0 x2 1
798
* 0 1 0 0 0 1 0 0 x3 0
799
* 0 0 1 1 1 0 1 1 * x4 + 0
800
* 0 1 0 0 1 0 0 0 x5 0
801
* 1 1 0 1 0 0 1 1 x6 0
802
* 0 1 0 0 1 0 1 0 x7 0
803
*/
804
.Ltf_lo__x2__and__fwd_aff:
805
.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
806
.Ltf_hi__x2__and__fwd_aff:
807
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
808
809
#ifdef CONFIG_AS_GFNI
810
.section .rodata.cst8, "aM", @progbits, 8
811
.align 8
812
/* AES affine: */
813
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
814
.Ltf_aff_bitmatrix:
815
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
816
BV8(1, 1, 0, 0, 0, 1, 1, 1),
817
BV8(1, 1, 1, 0, 0, 0, 1, 1),
818
BV8(1, 1, 1, 1, 0, 0, 0, 1),
819
BV8(1, 1, 1, 1, 1, 0, 0, 0),
820
BV8(0, 1, 1, 1, 1, 1, 0, 0),
821
BV8(0, 0, 1, 1, 1, 1, 1, 0),
822
BV8(0, 0, 0, 1, 1, 1, 1, 1))
823
824
/* AES inverse affine: */
825
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
826
.Ltf_inv_bitmatrix:
827
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
828
BV8(1, 0, 0, 1, 0, 0, 1, 0),
829
BV8(0, 1, 0, 0, 1, 0, 0, 1),
830
BV8(1, 0, 1, 0, 0, 1, 0, 0),
831
BV8(0, 1, 0, 1, 0, 0, 1, 0),
832
BV8(0, 0, 1, 0, 1, 0, 0, 1),
833
BV8(1, 0, 0, 1, 0, 1, 0, 0),
834
BV8(0, 1, 0, 0, 1, 0, 1, 0))
835
836
/* S2: */
837
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
838
.Ltf_s2_bitmatrix:
839
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
840
BV8(0, 0, 1, 1, 1, 1, 1, 1),
841
BV8(1, 1, 1, 0, 1, 1, 0, 1),
842
BV8(1, 1, 0, 0, 0, 0, 1, 1),
843
BV8(0, 1, 0, 0, 0, 0, 1, 1),
844
BV8(1, 1, 0, 0, 1, 1, 1, 0),
845
BV8(0, 1, 1, 0, 0, 0, 1, 1),
846
BV8(1, 1, 1, 1, 0, 1, 1, 0))
847
848
/* X2: */
849
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
850
.Ltf_x2_bitmatrix:
851
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
852
BV8(0, 0, 1, 0, 0, 1, 1, 0),
853
BV8(0, 0, 0, 0, 1, 0, 1, 0),
854
BV8(1, 1, 1, 0, 0, 0, 1, 1),
855
BV8(1, 1, 1, 0, 1, 1, 0, 0),
856
BV8(0, 1, 1, 0, 1, 0, 1, 1),
857
BV8(1, 0, 1, 1, 1, 1, 0, 1),
858
BV8(1, 0, 0, 1, 0, 0, 1, 1))
859
860
/* Identity matrix: */
861
.Ltf_id_bitmatrix:
862
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
863
BV8(0, 1, 0, 0, 0, 0, 0, 0),
864
BV8(0, 0, 1, 0, 0, 0, 0, 0),
865
BV8(0, 0, 0, 1, 0, 0, 0, 0),
866
BV8(0, 0, 0, 0, 1, 0, 0, 0),
867
BV8(0, 0, 0, 0, 0, 1, 0, 0),
868
BV8(0, 0, 0, 0, 0, 0, 1, 0),
869
BV8(0, 0, 0, 0, 0, 0, 0, 1))
870
871
#endif /* CONFIG_AS_GFNI */
872
873
/* 4-bit mask */
874
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
875
.align 4
876
.L0f0f0f0f:
877
.long 0x0f0f0f0f
878
879
.text
880
881
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
882
/* input:
883
* %r9: rk
884
* %rsi: dst
885
* %rdx: src
886
* %ymm0..%ymm15: byte-sliced blocks
887
*/
888
889
FRAME_BEGIN
890
891
movq %rsi, %rax;
892
leaq 8 * 32(%rax), %r8;
893
894
inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
895
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
896
%ymm15, %rax, %r8);
897
aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
898
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
899
%rax, %r9, 0);
900
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
901
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
902
%ymm15, %rax, %r9, 1);
903
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
904
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
905
%rax, %r9, 2);
906
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
907
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
908
%ymm15, %rax, %r9, 3);
909
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
910
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
911
%rax, %r9, 4);
912
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
913
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
914
%ymm15, %rax, %r9, 5);
915
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
916
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
917
%rax, %r9, 6);
918
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
919
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
920
%ymm15, %rax, %r9, 7);
921
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
922
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
923
%rax, %r9, 8);
924
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
925
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
926
%ymm15, %rax, %r9, 9);
927
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
928
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
929
%rax, %r9, 10);
930
cmpl $12, ARIA_CTX_rounds(CTX);
931
jne .Laria_192;
932
aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
933
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
934
%ymm15, %rax, %r9, 11, 12);
935
jmp .Laria_end;
936
.Laria_192:
937
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
938
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
939
%ymm15, %rax, %r9, 11);
940
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
941
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
942
%rax, %r9, 12);
943
cmpl $14, ARIA_CTX_rounds(CTX);
944
jne .Laria_256;
945
aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
946
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
947
%ymm15, %rax, %r9, 13, 14);
948
jmp .Laria_end;
949
.Laria_256:
950
aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
951
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
952
%ymm15, %rax, %r9, 13);
953
aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
954
%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
955
%rax, %r9, 14);
956
aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
957
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
958
%ymm15, %rax, %r9, 15, 16);
959
.Laria_end:
960
debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
961
%ymm9, %ymm13, %ymm0, %ymm5,
962
%ymm10, %ymm14, %ymm3, %ymm6,
963
%ymm11, %ymm15, %ymm2, %ymm7,
964
(%rax), (%r8));
965
966
FRAME_END
967
RET;
968
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
969
970
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
971
/* input:
972
* %rdi: ctx, CTX
973
* %rsi: dst
974
* %rdx: src
975
*/
976
977
FRAME_BEGIN
978
979
leaq ARIA_CTX_enc_key(CTX), %r9;
980
981
inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
982
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
983
%ymm15, %rdx);
984
985
call __aria_aesni_avx2_crypt_32way;
986
987
write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
988
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
989
%ymm15, %rax);
990
991
FRAME_END
992
RET;
993
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
994
995
SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
996
/* input:
997
* %rdi: ctx, CTX
998
* %rsi: dst
999
* %rdx: src
1000
*/
1001
1002
FRAME_BEGIN
1003
1004
leaq ARIA_CTX_dec_key(CTX), %r9;
1005
1006
inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1007
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1008
%ymm15, %rdx);
1009
1010
call __aria_aesni_avx2_crypt_32way;
1011
1012
write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1013
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1014
%ymm15, %rax);
1015
1016
FRAME_END
1017
RET;
1018
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
1019
1020
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* Generate 32 little-endian-incremented CTR blocks into the
	 * keystream buffer, write the advanced (+32) IV back to *%r8,
	 * and leave blocks 0..15 in %ymm0..%ymm7 / 16..31 in %ymm8..%ymm15.
	 *
	 * input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* Low 64 bits of the IV in native endianness; used below to detect
	 * whether +32 can carry into the high 64 bits of the counter. */
	movq 8(%r8), %r11;
	bswapq %r11;

	/* %ymm6: byte-swap mask; %ymm0: -1 in the low qword of each 128-bit
	 * lane (inc_le128 operand); %ymm5: -2 there, so vpsubq by %ymm5
	 * adds 2 to each lane's low qword (no cross-qword carry). */
	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	/* %ymm3 now holds counters {+1, +0}, one per 128-bit lane. */
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	/* Fast path: no carry possible, add 2 per step via vpsubq(-2). */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	/* Spill blocks 0..15 so %ymm8..%ymm15 can hold blocks 16..31. */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	/* Store the advanced counter (+32) back to the IV, big endian. */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	/* Reload blocks 0..15 into %ymm0..%ymm7 from the keystream spill. */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	/* Slow path: increment one step at a time with full 128-bit
	 * carry propagation (inc_le128). */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	/* Spill blocks 0..15 (%ymm8 holds {+1,+0} from before the branch). */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	/* Write the advanced counter (+32, high lane) back to the IV. */
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	/* Reload blocks 0..15 into %ymm0..%ymm7 from the keystream spill. */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
1177
1178
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* CTR-mode encrypt/decrypt of 32 blocks (AES-NI path):
	 * generate keystream, ARIA-encrypt it, XOR with src, store to dst.
	 *
	 * input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save caller's dst/src, then redirect the core transform at the
	 * keystream buffer (dst = src = keystream). */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	/* XOR encrypted counter blocks with the input text (saved in %r11).
	 * Register order matches the core's swapped output layout. */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* Write the result to the original dst (saved in %r10). */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
1221
1222
#ifdef CONFIG_AS_GFNI
1223
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* Core ARIA transform of 32 blocks using GFNI S-box evaluation.
	 * Runs the odd/even round pairs (aria_fo_gfni/aria_fe_gfni) and the
	 * final aria_ff_gfni, dispatching on ARIA_CTX_rounds for the
	 * 12-/14-/16-round key sizes.
	 *
	 * input:
	 * %r9: rk
	 * %rsi: dst
	 * %rdx: src
	 * %ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	/* %rax / %r8: scratch areas (dst and dst + 8*32) used by the
	 * byteslice/round macros. */
	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	/* Rounds 1..11: alternating odd (fo) / even (fe) rounds; register
	 * argument order alternates to match each round's output layout. */
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	/* 12 rounds (128-bit key): final round with keys 11, 12. */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	/* 14 rounds (192-bit key): two more rounds, then keys 13, 14. */
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	/* 16 rounds (256-bit key): two more rounds, then keys 15, 16. */
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	/* Undo the byte-slicing; output registers are in the swapped order
	 * callers pass to write_output. */
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1347
1348
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* ECB-style encryption of 32 blocks using the GFNI core with the
	 * encryption round keys.
	 *
	 * input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* %rax was set to dst by the core; register order matches its
	 * swapped output layout. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1372
1373
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* ECB-style decryption of 32 blocks: identical to the encrypt
	 * wrapper but selects the decryption round keys.
	 *
	 * input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* %rax was set to dst by the core; register order matches its
	 * swapped output layout. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1397
1398
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* CTR-mode encrypt/decrypt of 32 blocks (GFNI path):
	 * generate keystream, ARIA-encrypt it, XOR with src, store to dst.
	 *
	 * input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* Trailing ';' added for consistency with every other statement
	 * in this file (cf. the AES-NI CTR variant). */
	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save caller's dst/src, then redirect the core transform at the
	 * keystream buffer (dst = src = keystream). */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* XOR encrypted counter blocks with the input text (saved in %r11).
	 * Register order matches the core's swapped output layout. */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	/* Write the result to the original dst (saved in %r10). */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
1441
#endif /* CONFIG_AS_GFNI */
1442
1443