/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi

/*
 * 128-bit (xmm) aliases for each ymm register, used via token pasting
 * (reg##_x) when a macro needs the low lane of a 256-bit register.
 */
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
/*
 * Pack eight single-bit values into one byte: a0 becomes bit 0 (LSB),
 * a7 becomes bit 7 (MSB). Only the LSB of each argument is used.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
        ( (((a0) & 1) << 0) | \
          (((a1) & 1) << 1) | \
          (((a2) & 1) << 2) | \
          (((a3) & 1) << 3) | \
          (((a4) & 1) << 4) | \
          (((a5) & 1) << 5) | \
          (((a6) & 1) << 6) | \
          (((a7) & 1) << 7) )
/*
 * Pack eight byte-sized rows into a 64-bit bit-matrix: l0 is placed in
 * the most-significant byte and l7 in the least-significant byte, the
 * row order expected by the GF2P8AFFINE* bit-matrix operands.
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
        ( ((l7) << (0 * 8)) | \
          ((l6) << (1 * 8)) | \
          ((l5) << (2 * 8)) | \
          ((l4) << (3 * 8)) | \
          ((l3) << (4 * 8)) | \
          ((l2) << (5 * 8)) | \
          ((l1) << (6 * 8)) | \
          ((l0) << (7 * 8)) )
/*
 * Increment the little-endian 128-bit value in x by one.
 * minus_one must hold all-ones; tmp is clobbered. vpcmpeqq detects the
 * low-qword wraparound and vpslldq moves that flag into the high qword
 * so the second vpsubq applies the carry.
 */
#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
/*
 * Apply an 8-bit transform to every byte of x via two 4-bit table
 * lookups: lo_t is indexed by the low nibble, hi_t by the high nibble,
 * and the two results are XORed. mask4bit must hold 0x0f in every
 * byte; tmp0 is clobbered.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
/*
 * Transpose the 4x4 matrix of 32-bit lanes held across x0..x3
 * (in-place); t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
/*
 * Byte-slice 16 registers of 16-byte blocks: transposes the 16x16 byte
 * matrix so that byte i of every block ends up gathered in register i.
 * st0/st1 are memory scratch slots used to spill registers while they
 * double as transpose temporaries.
 */
#define byteslice_16x16b(a0, b0, c0, d0, \
                         a1, b1, c1, d1, \
                         a2, b2, c2, d2, \
                         a3, b3, c3, d3, \
                         st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */
/*
 * Inverse of byteslice_16x16b: gathers byte-sliced data back into
 * contiguous 16-byte blocks. Identical to the forward macro except for
 * the register rotation in the final two transpose passes.
 * st0/st1 are memory scratch slots.
 */
#define debyteslice_16x16b(a0, b0, c0, d0, \
                           a1, b1, c1, d1, \
                           a2, b2, c2, d2, \
                           a3, b3, c3, d3, \
                           st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(c0, d0, a0, b0, d2, d3); \
        transpose_4x4(c1, d1, a1, b1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(c2, d2, a2, b2, b0, b1); \
        transpose_4x4(c3, d3, a3, b3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     rio) \
        vmovdqu (0 * 32)(rio), x0; \
        vmovdqu (1 * 32)(rio), x1; \
        vmovdqu (2 * 32)(rio), x2; \
        vmovdqu (3 * 32)(rio), x3; \
        vmovdqu (4 * 32)(rio), x4; \
        vmovdqu (5 * 32)(rio), x5; \
        vmovdqu (6 * 32)(rio), x6; \
        vmovdqu (7 * 32)(rio), x7; \
        vmovdqu (8 * 32)(rio), y0; \
        vmovdqu (9 * 32)(rio), y1; \
        vmovdqu (10 * 32)(rio), y2; \
        vmovdqu (11 * 32)(rio), y3; \
        vmovdqu (12 * 32)(rio), y4; \
        vmovdqu (13 * 32)(rio), y5; \
        vmovdqu (14 * 32)(rio), y6; \
        vmovdqu (15 * 32)(rio), y7;
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      y0, y1, y2, y3, \
                      y4, y5, y6, y7, \
                      mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, \
                         x4, x5, x6, x7, \
                         y0, y1, y2, y3, \
                         y4, y5, y6, y7, \
                         (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab); \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu y0, 0 * 32(mem_cd); \
        vmovdqu y1, 1 * 32(mem_cd); \
        vmovdqu y2, 2 * 32(mem_cd); \
        vmovdqu y3, 3 * 32(mem_cd); \
        vmovdqu y4, 4 * 32(mem_cd); \
        vmovdqu y5, 5 * 32(mem_cd); \
        vmovdqu y6, 6 * 32(mem_cd); \
        vmovdqu y7, 7 * 32(mem_cd);
/*
 * Store all 16 state registers contiguously to mem.
 * NOTE: the scraped original ended the last statement with a stray
 * line-continuation backslash, which would splice the following line
 * into this macro; it is intentionally dropped here.
 */
#define write_output(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem) \
        vmovdqu x0, 0 * 32(mem); \
        vmovdqu x1, 1 * 32(mem); \
        vmovdqu x2, 2 * 32(mem); \
        vmovdqu x3, 3 * 32(mem); \
        vmovdqu x4, 4 * 32(mem); \
        vmovdqu x5, 5 * 32(mem); \
        vmovdqu x6, 6 * 32(mem); \
        vmovdqu x7, 7 * 32(mem); \
        vmovdqu y0, 8 * 32(mem); \
        vmovdqu y1, 9 * 32(mem); \
        vmovdqu y2, 10 * 32(mem); \
        vmovdqu y3, 11 * 32(mem); \
        vmovdqu y4, 12 * 32(mem); \
        vmovdqu y5, 13 * 32(mem); \
        vmovdqu y6, 14 * 32(mem); \
        vmovdqu y7, 15 * 32(mem);
/* Spill eight state registers to mem_tmp at 32-byte slot offset idx. */
#define aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, idx) \
        vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
        vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
        vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
        vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
        vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
        vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
        vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
        vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
/* Reload eight state registers from mem_tmp at 32-byte slot offset idx. */
#define aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, idx) \
        vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
        vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
        vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
        vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
        vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
        vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
        vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
        vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
/*
 * AddRoundKey for eight byte-sliced registers: broadcast one round-key
 * byte per register and XOR it in. Round keys are laid out as 16 bytes
 * per round at rk; the (+3, +2, +1, +0, +7, ...) byte order matches
 * the byte-sliced register layout. t0 is clobbered.
 */
#define aria_ark_8way(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      t0, rk, idx, round) \
        /* AddRoundKey */ \
        vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
        vpxor t0, x0, x0; \
        vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
        vpxor t0, x1, x1; \
        vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
        vpxor t0, x2, x2; \
        vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
        vpxor t0, x3, x3; \
        vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
        vpxor t0, x4, x4; \
        vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
        vpxor t0, x5, x5; \
        vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
        vpxor t0, x6, x6; \
        vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
        vpxor t0, x7, x7;
/*
 * ARIA S-box layer using GFNI: each of the four S-boxes (S1, S2, X1,
 * X2) is expressed as a GF(2^8) affine transform (and, where needed,
 * field inversion) via vgf2p8affine(inv)qb with the bit-matrices and
 * constants defined in .rodata below. t0-t4 are clobbered; t5-t7 are
 * unused but kept for signature parity with aria_sbox_8way.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            t0, t1, t2, t3, \
                            t4, t5, t6, t7) \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
        vgf2p8affineinvqb $0, t2, x2, x2; \
        vgf2p8affineinvqb $0, t2, x6, x6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
        vgf2p8affineinvqb $0, t2, x3, x3; \
        vgf2p8affineinvqb $0, t2, x7, x7
/*
 * ARIA S-box layer using AES-NI: isolates AES SubBytes/InvSubBytes via
 * vaesenclast/vaesdeclast with a zero round key, undoes the (inverse)
 * ShiftRows those instructions perform with vpshufb, and applies the
 * extra affine transforms for S2 and X2 via filter_8bit. AES-NI only
 * exists in 128-bit form here, so each 256-bit register is processed
 * as two 128-bit lanes (vextracti128/vinserti128). t0-t7 clobbered.
 * NOTE: the scraped original ended with a stray continuation
 * backslash, which would splice the next line into this macro; it is
 * intentionally dropped here.
 */
#define aria_sbox_8way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       t0, t1, t2, t3, \
                       t4, t5, t6, t7) \
        vpxor t7, t7, t7; \
        vpxor t6, t6, t6; \
        vbroadcasti128 .Linv_shift_row(%rip), t0; \
        vbroadcasti128 .Lshift_row(%rip), t1; \
        vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
        vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
        vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
        vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
        \
        vextracti128 $1, x0, t6##_x; \
        vaesenclast t7##_x, x0##_x, x0##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x0, x0; \
        \
        vextracti128 $1, x4, t6##_x; \
        vaesenclast t7##_x, x4##_x, x4##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x4, x4; \
        \
        vextracti128 $1, x1, t6##_x; \
        vaesenclast t7##_x, x1##_x, x1##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x1, x1; \
        \
        vextracti128 $1, x5, t6##_x; \
        vaesenclast t7##_x, x5##_x, x5##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x5, x5; \
        \
        vextracti128 $1, x2, t6##_x; \
        vaesdeclast t7##_x, x2##_x, x2##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x2, x2; \
        \
        vextracti128 $1, x6, t6##_x; \
        vaesdeclast t7##_x, x6##_x, x6##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x6, x6; \
        \
        vpbroadcastd .L0f0f0f0f(%rip), t6; \
        \
        /* AES inverse shift rows */ \
        vpshufb t0, x0, x0; \
        vpshufb t0, x4, x4; \
        vpshufb t0, x1, x1; \
        vpshufb t0, x5, x5; \
        vpshufb t1, x3, x3; \
        vpshufb t1, x7, x7; \
        vpshufb t1, x2, x2; \
        vpshufb t1, x6, x6; \
        \
        /* affine transformation for S2 */ \
        filter_8bit(x1, t2, t3, t6, t0); \
        /* affine transformation for S2 */ \
        filter_8bit(x5, t2, t3, t6, t0); \
        \
        /* affine transformation for X2 */ \
        filter_8bit(x3, t4, t5, t6, t0); \
        /* affine transformation for X2 */ \
        filter_8bit(x7, t4, t5, t6, t0); \
        \
        vpxor t6, t6, t6; \
        vextracti128 $1, x3, t6##_x; \
        vaesdeclast t7##_x, x3##_x, x3##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x3, x3; \
        \
        vextracti128 $1, x7, t6##_x; \
        vaesdeclast t7##_x, x7##_x, x7##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x7, x7;
/*
 * ARIA diffusion within a 32-bit word, computed on byte-sliced data:
 * X ^= rotr32(X, 8), then X = T ^ rotr32(X, 16), expressed as byte
 * permutations via register renaming. t0-t3 are clobbered.
 */
#define aria_diff_m(x0, x1, x2, x3, \
                    t0, t1, t2, t3) \
        /* T = rotr32(X, 8); */ \
        /* X ^= T */ \
        vpxor x0, x3, t0; \
        vpxor x1, x0, t1; \
        vpxor x2, x1, t2; \
        vpxor x3, x2, t3; \
        /* X = T ^ rotr(X, 16); */ \
        vpxor t2, x0, x0; \
        vpxor x1, t3, t3; \
        vpxor t0, x2, x2; \
        vpxor t1, x3, x1; \
        vmovdqu t3, x3;
/*
 * ARIA word-level diffusion: mixes the four 32-bit words t0..t3
 * (held byte-sliced in x0-x3 = t0, x4-x7 = t1, y0-y3 = t2,
 * y4-y7 = t3) with the XOR sequence of the reference implementation.
 */
#define aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7) \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7; \
        \
        /* t2 ^= t3; */ \
        vpxor y4, y0, y0; \
        vpxor y5, y1, y1; \
        vpxor y6, y2, y2; \
        vpxor y7, y3, y3; \
        \
        /* t0 ^= t1; */ \
        vpxor x4, x0, x0; \
        vpxor x5, x1, x1; \
        vpxor x6, x2, x2; \
        vpxor x7, x3, x3; \
        \
        /* t3 ^= t1; */ \
        vpxor x4, y4, y4; \
        vpxor x5, y5, y5; \
        vpxor x6, y6, y6; \
        vpxor x7, y7, y7; \
        \
        /* t2 ^= t0; */ \
        vpxor x0, y0, y0; \
        vpxor x1, y1, y1; \
        vpxor x2, y2, y2; \
        vpxor x3, y3, y3; \
        \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7;
/*
 * ARIA even round (AES-NI path): AddRoundKey + S-layer + byte/word
 * diffusion for both register halves, spilling through mem_tmp.
 */
#define aria_fe(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);
/*
 * ARIA odd round (AES-NI path): like aria_fe but with the odd-round
 * S-box ordering and the odd-round byte-permutation in the final
 * diffusion step.
 */
#define aria_fo(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);
/*
 * ARIA final round (AES-NI path): AddRoundKey + S-layer + a second
 * AddRoundKey with the last round key, no diffusion layer.
 */
#define aria_ff(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round, last_round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);
/* ARIA even round, GFNI path: same structure as aria_fe but with the
 * GFNI S-box implementation. */
#define aria_fe_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);
/* ARIA odd round, GFNI path: same structure as aria_fo but with the
 * GFNI S-box implementation. */
#define aria_fo_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);
/* ARIA final round, GFNI path: same structure as aria_ff but with the
 * GFNI S-box implementation. */
#define aria_ff_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round, last_round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);
754
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
755
.align 32
756
#define SHUFB_BYTES(idx) \
757
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
758
.Lshufb_16x16b:
759
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
760
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
761
.section .rodata.cst16, "aM", @progbits, 16
.align 16

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204
.section .rodata.cst8, "aM", @progbits, 8
807
.align 8
808
/* AES affine: */
809
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
810
.Ltf_aff_bitmatrix:
811
.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
812
BV8(1, 1, 0, 0, 0, 1, 1, 1),
813
BV8(1, 1, 1, 0, 0, 0, 1, 1),
814
BV8(1, 1, 1, 1, 0, 0, 0, 1),
815
BV8(1, 1, 1, 1, 1, 0, 0, 0),
816
BV8(0, 1, 1, 1, 1, 1, 0, 0),
817
BV8(0, 0, 1, 1, 1, 1, 1, 0),
818
BV8(0, 0, 0, 1, 1, 1, 1, 1))
819
820
/* AES inverse affine: */
821
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
822
.Ltf_inv_bitmatrix:
823
.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
824
BV8(1, 0, 0, 1, 0, 0, 1, 0),
825
BV8(0, 1, 0, 0, 1, 0, 0, 1),
826
BV8(1, 0, 1, 0, 0, 1, 0, 0),
827
BV8(0, 1, 0, 1, 0, 0, 1, 0),
828
BV8(0, 0, 1, 0, 1, 0, 0, 1),
829
BV8(1, 0, 0, 1, 0, 1, 0, 0),
830
BV8(0, 1, 0, 0, 1, 0, 1, 0))
831
832
/* S2: */
833
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
834
.Ltf_s2_bitmatrix:
835
.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
836
BV8(0, 0, 1, 1, 1, 1, 1, 1),
837
BV8(1, 1, 1, 0, 1, 1, 0, 1),
838
BV8(1, 1, 0, 0, 0, 0, 1, 1),
839
BV8(0, 1, 0, 0, 0, 0, 1, 1),
840
BV8(1, 1, 0, 0, 1, 1, 1, 0),
841
BV8(0, 1, 1, 0, 0, 0, 1, 1),
842
BV8(1, 1, 1, 1, 0, 1, 1, 0))
843
844
/* X2: */
845
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
846
.Ltf_x2_bitmatrix:
847
.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
848
BV8(0, 0, 1, 0, 0, 1, 1, 0),
849
BV8(0, 0, 0, 0, 1, 0, 1, 0),
850
BV8(1, 1, 1, 0, 0, 0, 1, 1),
851
BV8(1, 1, 1, 0, 1, 1, 0, 0),
852
BV8(0, 1, 1, 0, 1, 0, 1, 1),
853
BV8(1, 0, 1, 1, 1, 1, 0, 1),
854
BV8(1, 0, 0, 1, 0, 0, 1, 1))
855
856
/* Identity matrix: */
857
.Ltf_id_bitmatrix:
858
.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
859
BV8(0, 1, 0, 0, 0, 0, 0, 0),
860
BV8(0, 0, 1, 0, 0, 0, 0, 0),
861
BV8(0, 0, 0, 1, 0, 0, 0, 0),
862
BV8(0, 0, 0, 0, 1, 0, 0, 0),
863
BV8(0, 0, 0, 0, 0, 1, 0, 0),
864
BV8(0, 0, 0, 0, 0, 0, 1, 0),
865
BV8(0, 0, 0, 0, 0, 0, 0, 1))
866
867
/* 4-bit mask */
868
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
869
.align 4
870
.L0f0f0f0f:
871
.long 0x0f0f0f0f
872
873
.text

/*
 * Core ARIA round loop over 32 byte-sliced blocks (AVX2, AES-NI based
 * S-box emulation).  Dispatches on ARIA_CTX_rounds to run the 12-, 14-
 * or 16-round variant (key lengths 128/192/256 — presumably; confirm
 * against the key-schedule code).
 */
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 * %r9: rk
	 * %rsi: dst
	 * %rdx: src
	 * %ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	/* Use dst as scratch area: first half at %rax, second at %r8. */
	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	/* Rounds 0-10: alternate odd (aria_fo) and even (aria_fe) rounds. */
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	/* 12-round variant: final round uses rk[11] and rk[12]. */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	/* 14-round variant: final round uses rk[13] and rk[14]. */
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	/* 16-round variant: two more rounds, final uses rk[15] and rk[16]. */
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
963
964
/* Encrypt 32 blocks: load src, run core with the encryption key schedule. */
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	/* Core leaves dst pointer in %rax. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
988
989
/* Decrypt 32 blocks: identical to encrypt but with the decryption key schedule. */
SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	/* Core leaves dst pointer in %rax. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
1013
1014
/*
 * Generate 32 consecutive counter blocks from a 128-bit big-endian IV
 * into the keystream buffer, then write IV+32 back.  The fast path
 * increments two counters per vpsubq (the subtrahend is -2 in each
 * 128-bit lane); the slow path (.Lhandle_ctr_carry) is taken when the
 * low 64-bit counter word may overflow within 32 increments and does
 * full 128-bit increments via inc_le128.
 */
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* Low counter qword (big endian in memory) -> native order. */
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	/* First 16 counter blocks -> keystream buffer. */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	/* Store updated IV (big endian) back for the next call. */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	/* Reload the first 8 keystream registers spilled above. */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	/* First 16 counter blocks -> keystream buffer. */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	/* Store updated IV (big endian) back for the next call. */
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	/* Reload the first 8 keystream registers spilled above. */
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
1171
1172
/*
 * CTR mode for 32 blocks: generate counter blocks, encrypt them with
 * the encryption key schedule, XOR with src, write to dst.
 */
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save dst/src; point the core's dst/src at the keystream buffer. */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	/* XOR encrypted keystream with the saved plaintext (src in %r11). */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
1215
1216
/*
 * Core ARIA round loop over 32 byte-sliced blocks, GFNI variant
 * (S-boxes via affine GF(2^8) instructions instead of AES-NI).
 * Same round structure and register choreography as the AES-NI core.
 */
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 * %r9: rk
	 * %rsi: dst
	 * %rdx: src
	 * %ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	/* Use dst as scratch area: first half at %rax, second at %r8. */
	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	/* Rounds 0-10: alternate odd (fo) and even (fe) rounds. */
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	/* 12-round variant: final round uses rk[11] and rk[12]. */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	/* 14-round variant: final round uses rk[13] and rk[14]. */
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	/* 16-round variant: two more rounds, final uses rk[15] and rk[16]. */
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1340
1341
/* GFNI encrypt entry point: mirrors aria_aesni_avx2_encrypt_32way. */
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* Core leaves dst pointer in %rax. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1365
1366
/* GFNI decrypt entry point: mirrors aria_aesni_avx2_decrypt_32way. */
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* Core leaves dst pointer in %rax. */
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1390
1391
/*
 * CTR mode for 32 blocks, GFNI core: generate counter blocks, encrypt
 * them with the encryption key schedule, XOR with src, write to dst.
 */
SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	/* Save dst/src; point the core's dst/src at the keystream buffer. */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	/* XOR encrypted keystream with the saved plaintext (src in %r11). */
	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)