GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/chacha-avx512vl-x86_64.S
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
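
/*
 * CTR2BL/CTR4BL are 128-bit block-counter increments: added to the
 * fourth state row of two stacked blocks, they advance the lower/upper
 * lane block counter by 0/1 and 2/3 respectively.  CTR8BL holds the
 * eight 32-bit increments 0..7 used by the 8-block path, where the
 * counter word is broadcast across all lanes of one register.
 */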

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
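	#
	# Data layout (as set up below): each %ymm register holds one
	# 16-byte row of the ChaCha state, duplicated into both 128-bit
	# lanes, so the low lane computes the first block and the high lane
	# the second.  For reference, one quarter-round in C-like notation:
	#   a += b; d ^= a; d = rol32(d, 16);
	#   c += d; b ^= c; b = rol32(b, 12);
	#   a += b; d ^= a; d = rol32(d, 8);
	#   c += d; b ^= c; b = rol32(b, 7);
	# AVX-512VL's vprold performs each rotate in a single instruction.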

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vpaddd CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa %ymm0,%ymm8
	vmovdqa %ymm1,%ymm9
	vmovdqa %ymm2,%ymm10
	vmovdqa %ymm3,%ymm11

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
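
	# The three vpshufd rotations above move the state from column order
	# into diagonal order, so the next four quarter-round steps work on
	# the ChaCha diagonals; the mirrored shuffles at the end of the loop
	# body restore the original word order.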

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3

	sub $2,%r8d
	jnz .Ldoubleround
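
	# %r8d (nrounds, e.g. 20 for ChaCha20) drops by 2 per iteration, so
	# the loop runs nrounds/2 column+diagonal double rounds.  The code
	# below adds the saved input state back in (the ChaCha feed-forward)
	# and xors the keystream into the input 16 bytes at a time, branching
	# to .Lxorpart2 once less than a full 16-byte chunk remains.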

	# o0 = i0 ^ (x0 + s0)
	vpaddd %ymm8,%ymm0,%ymm7
	cmp $0x10,%rcx
	jl .Lxorpart2
	vpxord 0x00(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x00(%rsi)
	vextracti128 $1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd %ymm9,%ymm1,%ymm7
	cmp $0x20,%rcx
	jl .Lxorpart2
	vpxord 0x10(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x10(%rsi)
	vextracti128 $1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd %ymm10,%ymm2,%ymm7
	cmp $0x30,%rcx
	jl .Lxorpart2
	vpxord 0x20(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x20(%rsi)
	vextracti128 $1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd %ymm11,%ymm3,%ymm7
	cmp $0x40,%rcx
	jl .Lxorpart2
	vpxord 0x30(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x30(%rsi)
	vextracti128 $1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm7
	cmp $0x50,%rcx
	jl .Lxorpart2
	vpxord 0x40(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x40(%rsi)

	vmovdqa %xmm1,%xmm7
	cmp $0x60,%rcx
	jl .Lxorpart2
	vpxord 0x50(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x50(%rsi)

	vmovdqa %xmm2,%xmm7
	cmp $0x70,%rcx
	jl .Lxorpart2
	vpxord 0x60(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x60(%rsi)

	vmovdqa %xmm3,%xmm7
	cmp $0x80,%rcx
	jl .Lxorpart2
	vpxord 0x70(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
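	#
	# %rcx & 0xf is the number of tail bytes and %r9 the offset of the
	# last 16-byte boundary.  rax = (1 << cl) - 1 has one bit set per
	# tail byte; loaded into %k1, it makes the masked vmovdqu8 load and
	# store below touch only those bytes, while %xmm7 still holds the
	# matching keystream chunk.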
	mov %rcx,%rax
	and $0xf,%rcx
	jz .Ldone2
	mov %rax,%r9
	and $~0xf,%r9

	mov $1,%rax
	shld %cl,%rax,%rax
	sub $1,%rax
	kmovq %rax,%k1

	vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
	vpxord %xmm7,%xmm1,%xmm1
	vmovdqu8 %xmm1,(%rsi,%r9){%k1}

	jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices.
	# Since the required word shuffling has a rather high latency, we can
	# do the arithmetic on two matrix-pairs without much slowdown.
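	#
	# Register map (as set up below): blocks 0-1 live in %ymm0-3 and
	# blocks 2-3 in %ymm4-7, again one state row per register and one
	# block per 128-bit lane.  Each round step is issued for both pairs
	# back to back, which helps hide the shuffle and rotate latency.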

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vmovdqa %ymm0,%ymm4
	vmovdqa %ymm1,%ymm5
	vmovdqa %ymm2,%ymm6
	vmovdqa %ymm3,%ymm7

	vpaddd CTR2BL(%rip),%ymm3,%ymm3
	vpaddd CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa %ymm0,%ymm11
	vmovdqa %ymm1,%ymm12
	vmovdqa %ymm2,%ymm13
	vmovdqa %ymm3,%ymm14
	vmovdqa %ymm7,%ymm15
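	# %ymm11-13 save rows 0-2 (shared by both block pairs) and
	# %ymm14/%ymm15 save row 3 of each pair separately, since their
	# block counters differ; these copies feed the final x += s step.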

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $16,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxord %ymm4,%ymm7,%ymm7
	vprold $16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $12,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxord %ymm6,%ymm5,%ymm5
	vprold $12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxord %ymm4,%ymm7,%ymm7
	vprold $8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $7,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxord %ymm6,%ymm5,%ymm5
	vprold $7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	vpshufd $0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
	vpshufd $0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $16,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxord %ymm4,%ymm7,%ymm7
	vprold $16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $12,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxord %ymm6,%ymm5,%ymm5
	vprold $12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxord %ymm0,%ymm3,%ymm3
	vprold $8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxord %ymm4,%ymm7,%ymm7
	vprold $8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vprold $7,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxord %ymm6,%ymm5,%ymm5
	vprold $7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	vpshufd $0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3
	vpshufd $0x39,%ymm7,%ymm7

	sub $2,%r8d
	jnz .Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd %ymm11,%ymm0,%ymm10
	cmp $0x10,%rcx
	jl .Lxorpart4
	vpxord 0x00(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x00(%rsi)
	vextracti128 $1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd %ymm12,%ymm1,%ymm10
	cmp $0x20,%rcx
	jl .Lxorpart4
	vpxord 0x10(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x10(%rsi)
	vextracti128 $1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd %ymm13,%ymm2,%ymm10
	cmp $0x30,%rcx
	jl .Lxorpart4
	vpxord 0x20(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x20(%rsi)
	vextracti128 $1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd %ymm14,%ymm3,%ymm10
	cmp $0x40,%rcx
	jl .Lxorpart4
	vpxord 0x30(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x30(%rsi)
	vextracti128 $1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm10
	cmp $0x50,%rcx
	jl .Lxorpart4
	vpxord 0x40(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x40(%rsi)

	vmovdqa %xmm1,%xmm10
	cmp $0x60,%rcx
	jl .Lxorpart4
	vpxord 0x50(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x50(%rsi)

	vmovdqa %xmm2,%xmm10
	cmp $0x70,%rcx
	jl .Lxorpart4
	vpxord 0x60(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x60(%rsi)

	vmovdqa %xmm3,%xmm10
	cmp $0x80,%rcx
	jl .Lxorpart4
	vpxord 0x70(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd %ymm11,%ymm4,%ymm10
	cmp $0x90,%rcx
	jl .Lxorpart4
	vpxord 0x80(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x80(%rsi)
	vextracti128 $1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd %ymm12,%ymm5,%ymm10
	cmp $0xa0,%rcx
	jl .Lxorpart4
	vpxord 0x90(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x90(%rsi)
	vextracti128 $1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd %ymm13,%ymm6,%ymm10
	cmp $0xb0,%rcx
	jl .Lxorpart4
	vpxord 0xa0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xa0(%rsi)
	vextracti128 $1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd %ymm15,%ymm7,%ymm10
	cmp $0xc0,%rcx
	jl .Lxorpart4
	vpxord 0xb0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xb0(%rsi)
	vextracti128 $1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa %xmm4,%xmm10
	cmp $0xd0,%rcx
	jl .Lxorpart4
	vpxord 0xc0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xc0(%rsi)

	vmovdqa %xmm5,%xmm10
	cmp $0xe0,%rcx
	jl .Lxorpart4
	vpxord 0xd0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xd0(%rsi)

	vmovdqa %xmm6,%xmm10
	cmp $0xf0,%rcx
	jl .Lxorpart4
	vpxord 0xe0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xe0(%rsi)

	vmovdqa %xmm7,%xmm10
	cmp $0x100,%rcx
	jl .Lxorpart4
	vpxord 0xf0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
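	# (same masked-tail handling as .Lxorpart2, with the keystream chunk
	# in %xmm10)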
	mov %rcx,%rax
	and $0xf,%rcx
	jz .Ldone4
	mov %rax,%r9
	and $~0xf,%r9

	mov $1,%rax
	shld %cl,%rax,%rax
	sub $1,%rax
	kmovq %rax,%k1

	vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
	vpxord %xmm10,%xmm1,%xmm1
	vmovdqu8 %xmm1,(%rsi,%r9){%k1}

	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
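	#
	# Here the state is kept "transposed" (as set up below): each of
	# %ymm0-15 holds one of the 16 state words, broadcast across eight
	# 32-bit lanes, one lane per block.  Quarter-rounds therefore operate
	# on whole registers and need no word shuffling; only the counters
	# in %ymm12 differ between lanes.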

	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64 %ymm0,%ymm16
	vmovdqa64 %ymm1,%ymm17
	vmovdqa64 %ymm2,%ymm18
	vmovdqa64 %ymm3,%ymm19
	vmovdqa64 %ymm4,%ymm20
	vmovdqa64 %ymm5,%ymm21
	vmovdqa64 %ymm6,%ymm22
	vmovdqa64 %ymm7,%ymm23
	vmovdqa64 %ymm8,%ymm24
	vmovdqa64 %ymm9,%ymm25
	vmovdqa64 %ymm10,%ymm26
	vmovdqa64 %ymm11,%ymm27
	vmovdqa64 %ymm12,%ymm28
	vmovdqa64 %ymm13,%ymm29
	vmovdqa64 %ymm14,%ymm30
	vmovdqa64 %ymm15,%ymm31
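	# The EVEX-only registers %ymm16-31 (available with AVX-512VL) keep a
	# full copy of the input state, so the feed-forward addition after
	# the rounds needs no reloads from memory.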

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd %ymm0,%ymm4,%ymm0
	vpxord %ymm0,%ymm12,%ymm12
	vprold $16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd %ymm1,%ymm5,%ymm1
	vpxord %ymm1,%ymm13,%ymm13
	vprold $16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd %ymm2,%ymm6,%ymm2
	vpxord %ymm2,%ymm14,%ymm14
	vprold $16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd %ymm3,%ymm7,%ymm3
	vpxord %ymm3,%ymm15,%ymm15
	vprold $16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxord %ymm8,%ymm4,%ymm4
	vprold $12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxord %ymm9,%ymm5,%ymm5
	vprold $12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxord %ymm10,%ymm6,%ymm6
	vprold $12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxord %ymm11,%ymm7,%ymm7
	vprold $12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd %ymm0,%ymm4,%ymm0
	vpxord %ymm0,%ymm12,%ymm12
	vprold $8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd %ymm1,%ymm5,%ymm1
	vpxord %ymm1,%ymm13,%ymm13
	vprold $8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd %ymm2,%ymm6,%ymm2
	vpxord %ymm2,%ymm14,%ymm14
	vprold $8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd %ymm3,%ymm7,%ymm3
	vpxord %ymm3,%ymm15,%ymm15
	vprold $8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxord %ymm8,%ymm4,%ymm4
	vprold $7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxord %ymm9,%ymm5,%ymm5
	vprold $7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxord %ymm10,%ymm6,%ymm6
	vprold $7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxord %ymm11,%ymm7,%ymm7
	vprold $7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd %ymm0,%ymm5,%ymm0
	vpxord %ymm0,%ymm15,%ymm15
	vprold $16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd %ymm1,%ymm6,%ymm1
	vpxord %ymm1,%ymm12,%ymm12
	vprold $16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd %ymm2,%ymm7,%ymm2
	vpxord %ymm2,%ymm13,%ymm13
	vprold $16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd %ymm3,%ymm4,%ymm3
	vpxord %ymm3,%ymm14,%ymm14
	vprold $16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxord %ymm10,%ymm5,%ymm5
	vprold $12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxord %ymm11,%ymm6,%ymm6
	vprold $12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxord %ymm8,%ymm7,%ymm7
	vprold $12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxord %ymm9,%ymm4,%ymm4
	vprold $12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd %ymm0,%ymm5,%ymm0
	vpxord %ymm0,%ymm15,%ymm15
	vprold $8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd %ymm1,%ymm6,%ymm1
	vpxord %ymm1,%ymm12,%ymm12
	vprold $8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd %ymm2,%ymm7,%ymm2
	vpxord %ymm2,%ymm13,%ymm13
	vprold $8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd %ymm3,%ymm4,%ymm3
	vpxord %ymm3,%ymm14,%ymm14
	vprold $8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxord %ymm10,%ymm5,%ymm5
	vprold $7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxord %ymm11,%ymm6,%ymm6
	vprold $7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxord %ymm8,%ymm7,%ymm7
	vprold $7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxord %ymm9,%ymm4,%ymm4
	vprold $7,%ymm4,%ymm4

	sub $2,%r8d
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd %ymm16,%ymm0,%ymm0
	vpaddd %ymm17,%ymm1,%ymm1
	vpaddd %ymm18,%ymm2,%ymm2
	vpaddd %ymm19,%ymm3,%ymm3
	vpaddd %ymm20,%ymm4,%ymm4
	vpaddd %ymm21,%ymm5,%ymm5
	vpaddd %ymm22,%ymm6,%ymm6
	vpaddd %ymm23,%ymm7,%ymm7
	vpaddd %ymm24,%ymm8,%ymm8
	vpaddd %ymm25,%ymm9,%ymm9
	vpaddd %ymm26,%ymm10,%ymm10
	vpaddd %ymm27,%ymm11,%ymm11
	vpaddd %ymm28,%ymm12,%ymm12
	vpaddd %ymm29,%ymm13,%ymm13
	vpaddd %ymm30,%ymm14,%ymm14
	vpaddd %ymm31,%ymm15,%ymm15
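
	# Each register now holds one keystream word for all eight blocks.
	# The unpack steps below transpose that layout back into eight
	# contiguous 64-byte blocks: 32-bit words, then 64-bit pairs are
	# interleaved, and the final 128-bit interleave is fused with the
	# xor and store of each 32-byte chunk of output.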

	# interleave 32-bit words in state n, n+1
	vpunpckldq %ymm1,%ymm0,%ymm16
	vpunpckhdq %ymm1,%ymm0,%ymm17
	vpunpckldq %ymm3,%ymm2,%ymm18
	vpunpckhdq %ymm3,%ymm2,%ymm19
	vpunpckldq %ymm5,%ymm4,%ymm20
	vpunpckhdq %ymm5,%ymm4,%ymm21
	vpunpckldq %ymm7,%ymm6,%ymm22
	vpunpckhdq %ymm7,%ymm6,%ymm23
	vpunpckldq %ymm9,%ymm8,%ymm24
	vpunpckhdq %ymm9,%ymm8,%ymm25
	vpunpckldq %ymm11,%ymm10,%ymm26
	vpunpckhdq %ymm11,%ymm10,%ymm27
	vpunpckldq %ymm13,%ymm12,%ymm28
	vpunpckhdq %ymm13,%ymm12,%ymm29
	vpunpckldq %ymm15,%ymm14,%ymm30
	vpunpckhdq %ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq %ymm18,%ymm16,%ymm0
	vpunpcklqdq %ymm19,%ymm17,%ymm1
	vpunpckhqdq %ymm18,%ymm16,%ymm2
	vpunpckhqdq %ymm19,%ymm17,%ymm3
	vpunpcklqdq %ymm22,%ymm20,%ymm4
	vpunpcklqdq %ymm23,%ymm21,%ymm5
	vpunpckhqdq %ymm22,%ymm20,%ymm6
	vpunpckhqdq %ymm23,%ymm21,%ymm7
	vpunpcklqdq %ymm26,%ymm24,%ymm8
	vpunpcklqdq %ymm27,%ymm25,%ymm9
	vpunpckhqdq %ymm26,%ymm24,%ymm10
	vpunpckhqdq %ymm27,%ymm25,%ymm11
	vpunpcklqdq %ymm30,%ymm28,%ymm12
	vpunpcklqdq %ymm31,%ymm29,%ymm13
	vpunpckhqdq %ymm30,%ymm28,%ymm14
	vpunpckhqdq %ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
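	# (vperm2i128 selector 0x20 combines the low 128-bit lanes of its two
	# sources, yielding block n; selector 0x31 combines the high lanes,
	# yielding block n+4)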
	vmovdqa64 %ymm0,%ymm16
	vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
	cmp $0x0020,%rcx
	jl .Lxorpart8
	vpxord 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0000(%rsi)
	vmovdqa64 %ymm16,%ymm0
	vperm2i128 $0x31,%ymm4,%ymm0,%ymm4

	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	cmp $0x0040,%rcx
	jl .Lxorpart8
	vpxord 0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0020(%rsi)
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

	vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
	cmp $0x0060,%rcx
	jl .Lxorpart8
	vpxord 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0040(%rsi)
	vperm2i128 $0x31,%ymm6,%ymm2,%ymm6

	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	cmp $0x0080,%rcx
	jl .Lxorpart8
	vpxord 0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0060(%rsi)
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

	vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
	cmp $0x00a0,%rcx
	jl .Lxorpart8
	vpxord 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0080(%rsi)
	vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	cmp $0x00c0,%rcx
	jl .Lxorpart8
	vpxord 0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x00a0(%rsi)
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

	vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
	cmp $0x00e0,%rcx
	jl .Lxorpart8
	vpxord 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x00c0(%rsi)
	vperm2i128 $0x31,%ymm7,%ymm3,%ymm7

	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	cmp $0x0100,%rcx
	jl .Lxorpart8
	vpxord 0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x00e0(%rsi)
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64 %ymm4,%ymm0
	cmp $0x0120,%rcx
	jl .Lxorpart8
	vpxord 0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0100(%rsi)

	vmovdqa64 %ymm12,%ymm0
	cmp $0x0140,%rcx
	jl .Lxorpart8
	vpxord 0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0120(%rsi)

	vmovdqa64 %ymm6,%ymm0
	cmp $0x0160,%rcx
	jl .Lxorpart8
	vpxord 0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0140(%rsi)

	vmovdqa64 %ymm14,%ymm0
	cmp $0x0180,%rcx
	jl .Lxorpart8
	vpxord 0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0160(%rsi)

	vmovdqa64 %ymm5,%ymm0
	cmp $0x01a0,%rcx
	jl .Lxorpart8
	vpxord 0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x0180(%rsi)

	vmovdqa64 %ymm13,%ymm0
	cmp $0x01c0,%rcx
	jl .Lxorpart8
	vpxord 0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x01a0(%rsi)

	vmovdqa64 %ymm7,%ymm0
	cmp $0x01e0,%rcx
	jl .Lxorpart8
	vpxord 0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x01c0(%rsi)

	vmovdqa64 %ymm15,%ymm0
	cmp $0x0200,%rcx
	jl .Lxorpart8
	vpxord 0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64 %ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
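	# (same masked-tail handling as .Lxorpart2, but with 32-byte
	# granularity: %rcx & 0x1f is the tail length and %ymm0 holds the
	# matching keystream chunk)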
	mov %rcx,%rax
	and $0x1f,%rcx
	jz .Ldone8
	mov %rax,%r9
	and $~0x1f,%r9

	mov $1,%rax
	shld %cl,%rax,%rax
	sub $1,%rax
	kmovq %rax,%k1

	vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
	vpxord %ymm0,%ymm1,%ymm1
	vmovdqu8 %ymm1,(%rsi,%r9){%k1}

	jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)