/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 64-way parallel algorithm (AVX512)
 *
 * Copyright (c) 2022 Taehee Yoo <[email protected]>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

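/*
 * Reference note: BV8() packs eight GF(2) bits into one matrix-row byte,
 * a0 being the least significant bit, and BM8X8() packs eight such rows
 * into the 64-bit matrix operand used by vgf2p8affineqb/vgf2p8affineinvqb
 * below, with row l0 ending up in the most significant byte.  As a small
 * worked example,
 *
 *	BV8(1, 1, 0, 0, 0, 1, 1, 0) == 0x63
 *
 * which is tf_aff_const further down, the familiar AES S-box affine
 * constant.
 */
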
#define add_le128(out, in, lo_counter, hi_counter1) \
	vpaddq lo_counter, in, out; \
	vpcmpuq $1, lo_counter, out, %k1; \
	kaddb %k1, %k1, %k1; \
	vpaddq hi_counter1, out, out{%k1};

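/*
 * Reference sketch of add_le128() (illustrative only): each 128-bit lane
 * holds a counter as two little-endian qwords, and per lane the macro in
 * effect computes
 *
 *	out.lo = in.lo + lo_counter.lo;
 *	out.hi = in.hi;			(lo_counter's high qword is 0)
 *	if (out.lo < lo_counter.lo)	(unsigned compare, i.e. carry out)
 *		out.hi += 1;		(hi_counter1 holds 1 in the high qword)
 *
 * vpcmpuq with predicate 1 ("less than", unsigned) records the carries in
 * %k1, kaddb doubles the mask to move each carry bit from the low-qword to
 * the high-qword position of its lane, and the masked vpaddq applies them.
 */
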
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandq x, mask4bit, tmp0; \
	vpandnq x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxorq tmp0, x, x;

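/*
 * Reference sketch of filter_8bit() (illustrative only): a split-nibble
 * table lookup, with mask4bit expected to hold 0x0f in every byte:
 *
 *	tmp0 = lo_t[x & 0x0f];
 *	x    = hi_t[x >> 4];
 *	x   ^= tmp0;
 *
 * The macro is not referenced in this file; the S-box work is done with the
 * GFNI helpers further below.
 */
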
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
		a1, b1, c1, d1, \
		a2, b2, c2, d2, \
		a3, b3, c3, d3, \
		st0, st1) \
	vmovdqu64 d2, st0; \
	vmovdqu64 d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 a0, st0; \
	vmovdqu64 a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
	vmovdqu64 st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu64 d3, st1; \
	vmovdqu64 st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu64 d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 b0, st0; \
	vmovdqu64 b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu64 st0, b0; \
	vmovdqu64 st1, b1; \
	/* does not adjust output bytes inside vectors */

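/*
 * Note on the byteslicing helpers: byteslice_16x16b() and
 * debyteslice_16x16b() transpose the 16x16 byte matrix formed by the
 * sixteen input vectors, so that, roughly speaking, each register ends up
 * holding one byte position of every block instead of whole blocks.  That
 * layout is what lets aria_ark_16way() below apply one round-key byte to a
 * whole register with a single vpbroadcastb, and the GFNI S-box helpers
 * substitute one byte position at a time.  st0/st1 are scratch spill slots
 * in memory (mem_ab/mem_cd at the call sites).
 */
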
#define debyteslice_16x16b(a0, b0, c0, d0, \
		a1, b1, c1, d1, \
		a2, b2, c2, d2, \
		a3, b3, c3, d3, \
		st0, st1) \
	vmovdqu64 d2, st0; \
	vmovdqu64 d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 a0, st0; \
	vmovdqu64 a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
	vmovdqu64 st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu64 d3, st1; \
	vmovdqu64 st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu64 d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu64 st0, d2; \
	vmovdqu64 st1, d3; \
	\
	vmovdqu64 b0, st0; \
	vmovdqu64 b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu64 st0, b0; \
	vmovdqu64 st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		rio) \
	vmovdqu64 (0 * 64)(rio), x0; \
	vmovdqu64 (1 * 64)(rio), x1; \
	vmovdqu64 (2 * 64)(rio), x2; \
	vmovdqu64 (3 * 64)(rio), x3; \
	vmovdqu64 (4 * 64)(rio), x4; \
	vmovdqu64 (5 * 64)(rio), x5; \
	vmovdqu64 (6 * 64)(rio), x6; \
	vmovdqu64 (7 * 64)(rio), x7; \
	vmovdqu64 (8 * 64)(rio), y0; \
	vmovdqu64 (9 * 64)(rio), y1; \
	vmovdqu64 (10 * 64)(rio), y2; \
	vmovdqu64 (11 * 64)(rio), y3; \
	vmovdqu64 (12 * 64)(rio), y4; \
	vmovdqu64 (13 * 64)(rio), y5; \
	vmovdqu64 (14 * 64)(rio), y6; \
	vmovdqu64 (15 * 64)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		(mem_ab), (mem_cd)); \
	\
	vmovdqu64 x0, 0 * 64(mem_ab); \
	vmovdqu64 x1, 1 * 64(mem_ab); \
	vmovdqu64 x2, 2 * 64(mem_ab); \
	vmovdqu64 x3, 3 * 64(mem_ab); \
	vmovdqu64 x4, 4 * 64(mem_ab); \
	vmovdqu64 x5, 5 * 64(mem_ab); \
	vmovdqu64 x6, 6 * 64(mem_ab); \
	vmovdqu64 x7, 7 * 64(mem_ab); \
	vmovdqu64 y0, 0 * 64(mem_cd); \
	vmovdqu64 y1, 1 * 64(mem_cd); \
	vmovdqu64 y2, 2 * 64(mem_cd); \
	vmovdqu64 y3, 3 * 64(mem_cd); \
	vmovdqu64 y4, 4 * 64(mem_cd); \
	vmovdqu64 y5, 5 * 64(mem_cd); \
	vmovdqu64 y6, 6 * 64(mem_cd); \
	vmovdqu64 y7, 7 * 64(mem_cd);

#define write_output(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem) \
	vmovdqu64 x0, 0 * 64(mem); \
	vmovdqu64 x1, 1 * 64(mem); \
	vmovdqu64 x2, 2 * 64(mem); \
	vmovdqu64 x3, 3 * 64(mem); \
	vmovdqu64 x4, 4 * 64(mem); \
	vmovdqu64 x5, 5 * 64(mem); \
	vmovdqu64 x6, 6 * 64(mem); \
	vmovdqu64 x7, 7 * 64(mem); \
	vmovdqu64 y0, 8 * 64(mem); \
	vmovdqu64 y1, 9 * 64(mem); \
	vmovdqu64 y2, 10 * 64(mem); \
	vmovdqu64 y3, 11 * 64(mem); \
	vmovdqu64 y4, 12 * 64(mem); \
	vmovdqu64 y5, 13 * 64(mem); \
	vmovdqu64 y6, 14 * 64(mem); \
	vmovdqu64 y7, 15 * 64(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		mem_tmp, idx) \
	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		mem_tmp, idx) \
	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;

#define aria_ark_16way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		t0, rk, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + 3)(rk), t0; \
	vpxorq t0, x0, x0; \
	vpbroadcastb ((round * 16) + 2)(rk), t0; \
	vpxorq t0, x1, x1; \
	vpbroadcastb ((round * 16) + 1)(rk), t0; \
	vpxorq t0, x2, x2; \
	vpbroadcastb ((round * 16) + 0)(rk), t0; \
	vpxorq t0, x3, x3; \
	vpbroadcastb ((round * 16) + 7)(rk), t0; \
	vpxorq t0, x4, x4; \
	vpbroadcastb ((round * 16) + 6)(rk), t0; \
	vpxorq t0, x5, x5; \
	vpbroadcastb ((round * 16) + 5)(rk), t0; \
	vpxorq t0, x6, x6; \
	vpbroadcastb ((round * 16) + 4)(rk), t0; \
	vpxorq t0, x7, x7; \
	vpbroadcastb ((round * 16) + 11)(rk), t0; \
	vpxorq t0, y0, y0; \
	vpbroadcastb ((round * 16) + 10)(rk), t0; \
	vpxorq t0, y1, y1; \
	vpbroadcastb ((round * 16) + 9)(rk), t0; \
	vpxorq t0, y2, y2; \
	vpbroadcastb ((round * 16) + 8)(rk), t0; \
	vpxorq t0, y3, y3; \
	vpbroadcastb ((round * 16) + 15)(rk), t0; \
	vpxorq t0, y4, y4; \
	vpbroadcastb ((round * 16) + 14)(rk), t0; \
	vpxorq t0, y5, y5; \
	vpbroadcastb ((round * 16) + 13)(rk), t0; \
	vpxorq t0, y6, y6; \
	vpbroadcastb ((round * 16) + 12)(rk), t0; \
	vpxorq t0, y7, y7;

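/*
 * Note on aria_ark_16way(): with the state byte-sliced, AddRoundKey reduces
 * to broadcasting one round-key byte per register and XORing it in.  The
 * key bytes are taken in byte-swapped order within each 32-bit word
 * (offsets 3, 2, 1, 0, then 7, 6, 5, 4, ...), which lines each key byte up
 * with the register that holds the corresponding state byte after the
 * byteslice transform above.
 */
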
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		t0, t1, t2, t3, \
		t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7;

#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		t0, t1, t2, t3, \
		t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
	vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
	vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
	vgf2p8affineinvqb $0, t2, y2, y2; \
	vgf2p8affineinvqb $0, t2, y6, y6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
	vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
	vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
	vgf2p8affineinvqb $0, t2, y3, y3; \
	vgf2p8affineinvqb $0, t2, y7, y7;


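/*
 * Note on the GFNI S-box helpers: ARIA uses four byte substitutions, S1
 * (the AES S-box), S2, and their inverses, and each of them can be written
 * as an affine map combined with a GF(2^8) inversion.  In the helpers above:
 *
 *	S1     : vgf2p8affineinvqb with .Ltf_aff_bitmatrix / tf_aff_const
 *	S2     : vgf2p8affineinvqb with .Ltf_s2_bitmatrix / tf_s2_const
 *	S1^-1  : AES inverse affine (vgf2p8affineqb, .Ltf_inv_bitmatrix)
 *		 followed by a plain inversion (identity matrix, imm 0)
 *	S2^-1  : .Ltf_x2_bitmatrix affine followed by a plain inversion
 *
 * The 8-way variant covers one half of the byte-sliced state, the 16-way
 * variant both halves.
 */
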
#define aria_diff_m(x0, x1, x2, x3, \
		t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxorq x0, x3, t0; \
	vpxorq x1, x0, t1; \
	vpxorq x2, x1, t2; \
	vpxorq x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxorq t2, x0, x0; \
	vpxorq x1, t3, t3; \
	vpxorq t0, x2, x2; \
	vpxorq t1, x3, x1; \
	vmovdqu64 t3, x3;

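/*
 * Reference sketch of aria_diff_m() (illustrative only): with one register
 * per byte of a 32-bit word, the rotate-and-XOR sequence in the comments
 * above works out to
 *
 *	out0 = x0 ^ x1 ^ x2
 *	out1 = x0 ^ x1 ^ x3
 *	out2 = x0 ^ x2 ^ x3
 *	out3 = x1 ^ x2 ^ x3
 *
 * i.e. every output byte is the XOR of three of the four input bytes; this
 * is the word-local piece of ARIA's diffusion layer in the byte-sliced
 * decomposition used here.
 */
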
#define aria_diff_word(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxorq y0, x4, x4; \
	vpxorq y1, x5, x5; \
	vpxorq y2, x6, x6; \
	vpxorq y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxorq y4, y0, y0; \
	vpxorq y5, y1, y1; \
	vpxorq y6, y2, y2; \
	vpxorq y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxorq x4, x0, x0; \
	vpxorq x5, x1, x1; \
	vpxorq x6, x2, x2; \
	vpxorq x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxorq x4, y4, y4; \
	vpxorq x5, y5, y5; \
	vpxorq x6, y6, y6; \
	vpxorq x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxorq x0, y0, y0; \
	vpxorq x1, y1, y1; \
	vpxorq x2, y2, y2; \
	vpxorq x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxorq y0, x4, x4; \
	vpxorq y1, x5, x5; \
	vpxorq y2, x6, x6; \
	vpxorq y3, x7, x7;

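/*
 * Note on aria_diff_word(): x0-x3, x4-x7, y0-y3 and y4-y7 each hold one
 * 32-bit word (t0..t3) of the byte-sliced state.  Expanding the XOR chain
 * above in terms of the original inputs gives
 *
 *	t0' = t0 ^ t1 ^ t2
 *	t1' = t0 ^ t2 ^ t3
 *	t2' = t0 ^ t1 ^ t3
 *	t3' = t1 ^ t2 ^ t3
 *
 * so, as with aria_diff_m(), every output word is the XOR of three of the
 * four inputs.  The byte permutation step (aria_diff_byte) is folded into
 * the argument order of the calls in the round macros below.
 */
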
#define aria_fe_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7, \
		mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		y0, y1, y2, y3, y4, y5, y6, y7, \
		z0, rk, round); \
	\
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
		x6, x7, x4, x5, \
		y2, y3, y0, y1, \
		y6, y7, y4, y5, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7); \
	\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		x7, x6, x5, x4, \
		y0, y1, y2, y3, \
		y5, y4, y7, y6); \


#define aria_fo_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7, \
		mem_tmp, rk, round) \
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
		y0, y1, y2, y3, y4, y5, y6, y7, \
		z0, rk, round); \
	\
	aria_sbox_16way_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7); \
	\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
	aria_diff_word(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		x5, x4, x7, x6, \
		y2, y3, y0, y1, \
		y7, y6, y5, y4);

#define aria_ff_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_16way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, rk, round); \
	aria_sbox_16way_gfni(x2, x3, x0, x1, \
		x6, x7, x4, x5, \
		y2, y3, y0, y1, \
		y6, y7, y4, y5, \
		z0, z1, z2, z3, \
		z4, z5, z6, z7); \
	aria_ark_16way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		z0, rk, last_round);


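/*
 * Note on the round macros: aria_fo_gfni() corresponds to ARIA's odd-round
 * function FO and aria_fe_gfni() to the even-round function FE.  They
 * differ only in which substitution layer they apply (expressed by the
 * rotated register order passed to aria_sbox_16way_gfni()) and in the
 * aria_diff_byte() permutation folded into the final aria_diff_word()
 * call.  aria_ff_gfni() is the last round: AddRoundKey, substitution, then
 * a second AddRoundKey with the final key, with no diffusion layer.
 */
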
.section .rodata.cst64, "aM", @progbits, 64
.align 64
.Lcounter0123_lo:
	.quad 0, 0
	.quad 1, 0
	.quad 2, 0
	.quad 3, 0

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16

.Lcounter4444_lo:
	.quad 4, 0
.Lcounter8888_lo:
	.quad 8, 0
.Lcounter16161616_lo:
	.quad 16, 0
.Lcounter1111_hi:
	.quad 0, 1

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%zmm0..%zmm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 64(%rax), %r8;

	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14,
		%zmm15, %rax, %r8);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 0);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 1);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 2);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 3);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 4);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 5);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 6);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 7);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 8);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 9);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 11);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 13);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		%zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 14);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		%zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10,
		%zmm12, %zmm13, %zmm14, %zmm15,
		%zmm24, %zmm25, %zmm26, %zmm27,
		%zmm28, %zmm29, %zmm30, %zmm31,
		%rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
		%zmm8, %zmm13, %zmm2, %zmm7,
		%zmm11, %zmm14, %zmm1, %zmm4,
		%zmm10, %zmm15, %zmm0, %zmm5,
		(%rax), (%r8));
	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		%zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		%zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)

SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		%zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		%zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		%zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)

SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN

	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

	/* load IV and byteswap */
	movq 8(%r8), %r11;
	movq (%r8), %r10;
	bswapq %r11;
	bswapq %r10;
	vbroadcasti64x2 (%r8), %zmm20;
	vpshufb %zmm19, %zmm20, %zmm20;

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 64), %r11;
	ja .Lload_ctr_carry;

	/* construct IVs */
	vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
	vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
	vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
	vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
	vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
	vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
	vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
	vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
	vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
	vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
	vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
	vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
	vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
	vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
	jmp .Lload_ctr_done;

.Lload_ctr_carry:
	/* construct IVs */
	add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
	add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
	add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
	add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
	add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
	add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
	add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
	add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
	add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
	add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
	add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
	add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
	add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
	add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
	/* Byte-swap IVs and update counter. */
	addq $64, %r11;
	adcq $0, %r10;
	vpshufb %zmm19, %zmm15, %zmm15;
	vpshufb %zmm19, %zmm14, %zmm14;
	vpshufb %zmm19, %zmm13, %zmm13;
	vpshufb %zmm19, %zmm12, %zmm12;
	vpshufb %zmm19, %zmm11, %zmm11;
	vpshufb %zmm19, %zmm10, %zmm10;
	vpshufb %zmm19, %zmm9, %zmm9;
	vpshufb %zmm19, %zmm8, %zmm8;
	bswapq %r11;
	bswapq %r10;
	vpshufb %zmm19, %zmm7, %zmm7;
	vpshufb %zmm19, %zmm6, %zmm6;
	vpshufb %zmm19, %zmm5, %zmm5;
	vpshufb %zmm19, %zmm4, %zmm4;
	vpshufb %zmm19, %zmm3, %zmm3;
	vpshufb %zmm19, %zmm2, %zmm2;
	vpshufb %zmm19, %zmm1, %zmm1;
	vpshufb %zmm19, %zmm0, %zmm0;
	movq %r11, 8(%r8);
	movq %r10, (%r8);

	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)

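/*
 * Note on the keystream generator above: the IV is kept as two 64-bit
 * halves.  When the low half is more than 64 below overflow (the cmpq
 * against 0xffffffffffffffff - 64), all 64 counter values can be built with
 * plain vpaddq on the low qwords.  Otherwise the .Lload_ctr_carry path uses
 * add_le128(), which propagates each low-qword carry into the high qword of
 * its 128-bit lane via the %k1 mask.
 */
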
SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_gfni_avx512_ctr_gen_keystream_64way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_gfni_avx512_crypt_64way;

	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		%zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		%zmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)