GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/chacha-avx2-x86_64.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003
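
# ROT8 and ROT16 are vpshufb masks: within each 32-bit lane they permute
# the bytes so that the lane is rotated left by 8 and 16 bits, respectively.
# CTRINC adds the block numbers 0..7 to the eight counter lanes used by the
# 8-block function; CTR2BL and CTR4BL add 0/1 and 2/3 to the counters of
# the two 128-bit lanes used by the 2- and 4-block functions.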

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

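	# For reference, one ChaCha quarter-round on a word tuple (a, b, c, d)
	# is, in scalar C terms (rol32 = rotate left):
	#
	#   a += b; d = rol32(d ^ a, 16);
	#   c += d; b = rol32(b ^ c, 12);
	#   a += b; d = rol32(d ^ a, 8);
	#   c += d; b = rol32(b ^ c, 7);
	#
	# Below, ymm0..ymm3 hold the four state rows of both blocks (one block
	# per 128-bit lane), so each vector instruction applies the same
	# quarter-round step to all four columns of both blocks at once; the
	# vpshufd shuffles re-align the rows for the diagonal round and back.
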
	vzeroupper

	# x0..3[0-2] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vpaddd CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa %ymm0,%ymm8
	vmovdqa %ymm1,%ymm9
	vmovdqa %ymm2,%ymm10
	vmovdqa %ymm3,%ymm11

	vmovdqa ROT8(%rip),%ymm4
	vmovdqa ROT16(%rip),%ymm5

	mov %rcx,%rax

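	# %r8d holds nrounds; each pass through .Ldoubleround below is one
	# ChaCha double round (a column round followed by a diagonal round),
	# so the count is decremented by 2 per iteration.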
.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3

	sub $2,%r8d
	jnz .Ldoubleround

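	# The keystream is combined with the input and written out in 16-byte
	# chunks.  Before each chunk, the total length in %rax is compared
	# against the chunk's end offset; if the full chunk does not fit, we
	# branch to .Lxorpart2 with the pending keystream chunk in %xmm7.
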
	# o0 = i0 ^ (x0 + s0)
	vpaddd %ymm8,%ymm0,%ymm7
	cmp $0x10,%rax
	jl .Lxorpart2
	vpxor 0x00(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x00(%rsi)
	vextracti128 $1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd %ymm9,%ymm1,%ymm7
	cmp $0x20,%rax
	jl .Lxorpart2
	vpxor 0x10(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x10(%rsi)
	vextracti128 $1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd %ymm10,%ymm2,%ymm7
	cmp $0x30,%rax
	jl .Lxorpart2
	vpxor 0x20(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x20(%rsi)
	vextracti128 $1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd %ymm11,%ymm3,%ymm7
	cmp $0x40,%rax
	jl .Lxorpart2
	vpxor 0x30(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x30(%rsi)
	vextracti128 $1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm7
	cmp $0x50,%rax
	jl .Lxorpart2
	vpxor 0x40(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x40(%rsi)

	vmovdqa %xmm1,%xmm7
	cmp $0x60,%rax
	jl .Lxorpart2
	vpxor 0x50(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x50(%rsi)

	vmovdqa %xmm2,%xmm7
	cmp $0x70,%rax
	jl .Lxorpart2
	vpxor 0x60(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x60(%rsi)

	vmovdqa %xmm3,%xmm7
	cmp $0x80,%rax
	jl .Lxorpart2
	vpxor 0x70(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
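	# %r9 = number of trailing bytes (len & 15); %rax is rounded down to
	# the offset of the partial chunk.  The trailing input bytes are
	# copied to a 32-byte aligned scratch slot on the stack, xor'ed there
	# with the keystream chunk left in %xmm7, and copied back to the
	# output, so nothing is read or written past the end of the caller's
	# buffers.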
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone2
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm7,%xmm7
	vmovdqa %xmm7,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, and then
	# sequentially on the four words of the other two matrices. The
	# required word shuffling has rather high latency, so interleaving
	# the arithmetic for the two matrix-pairs hides it without much
	# slowdown.

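	# Register use below (it mirrors the 2-block function, run twice in
	# interleaved fashion):
	#   ymm0..ymm3   - working state for blocks 0 and 1, one per 128-bit lane
	#   ymm4..ymm7   - working state for blocks 2 and 3
	#   ymm11..ymm14 - saved initial state rows for blocks 0 and 1
	#   ymm15        - saved initial row 3 (with counters) for blocks 2 and 3
	#   ymm8/ymm9    - ROT8/ROT16 shuffle masks, ymm10 - scratch
	# CTR2BL and CTR4BL give the four lanes block counters +0..+3.
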
	vzeroupper

	# x0..3[0-4] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vmovdqa %ymm0,%ymm4
	vmovdqa %ymm1,%ymm5
	vmovdqa %ymm2,%ymm6
	vmovdqa %ymm3,%ymm7

	vpaddd CTR2BL(%rip),%ymm3,%ymm3
	vpaddd CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa %ymm0,%ymm11
	vmovdqa %ymm1,%ymm12
	vmovdqa %ymm2,%ymm13
	vmovdqa %ymm3,%ymm14
	vmovdqa %ymm7,%ymm15

	vmovdqa ROT8(%rip),%ymm8
	vmovdqa ROT16(%rip),%ymm9

	mov %rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	vpshufd $0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
	vpshufd $0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	vpshufd $0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3
	vpshufd $0x39,%ymm7,%ymm7

	sub $2,%r8d
	jnz .Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd %ymm11,%ymm0,%ymm10
	cmp $0x10,%rax
	jl .Lxorpart4
	vpxor 0x00(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x00(%rsi)
	vextracti128 $1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd %ymm12,%ymm1,%ymm10
	cmp $0x20,%rax
	jl .Lxorpart4
	vpxor 0x10(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x10(%rsi)
	vextracti128 $1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd %ymm13,%ymm2,%ymm10
	cmp $0x30,%rax
	jl .Lxorpart4
	vpxor 0x20(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x20(%rsi)
	vextracti128 $1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd %ymm14,%ymm3,%ymm10
	cmp $0x40,%rax
	jl .Lxorpart4
	vpxor 0x30(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x30(%rsi)
	vextracti128 $1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm10
	cmp $0x50,%rax
	jl .Lxorpart4
	vpxor 0x40(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x40(%rsi)

	vmovdqa %xmm1,%xmm10
	cmp $0x60,%rax
	jl .Lxorpart4
	vpxor 0x50(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x50(%rsi)

	vmovdqa %xmm2,%xmm10
	cmp $0x70,%rax
	jl .Lxorpart4
	vpxor 0x60(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x60(%rsi)

	vmovdqa %xmm3,%xmm10
	cmp $0x80,%rax
	jl .Lxorpart4
	vpxor 0x70(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd %ymm11,%ymm4,%ymm10
	cmp $0x90,%rax
	jl .Lxorpart4
	vpxor 0x80(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x80(%rsi)
	vextracti128 $1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd %ymm12,%ymm5,%ymm10
	cmp $0xa0,%rax
	jl .Lxorpart4
	vpxor 0x90(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x90(%rsi)
	vextracti128 $1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd %ymm13,%ymm6,%ymm10
	cmp $0xb0,%rax
	jl .Lxorpart4
	vpxor 0xa0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xa0(%rsi)
	vextracti128 $1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd %ymm15,%ymm7,%ymm10
	cmp $0xc0,%rax
	jl .Lxorpart4
	vpxor 0xb0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xb0(%rsi)
	vextracti128 $1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa %xmm4,%xmm10
	cmp $0xd0,%rax
	jl .Lxorpart4
	vpxor 0xc0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xc0(%rsi)

	vmovdqa %xmm5,%xmm10
	cmp $0xe0,%rax
	jl .Lxorpart4
	vpxor 0xd0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xd0(%rsi)

	vmovdqa %xmm6,%xmm10
	cmp $0xf0,%rax
	jl .Lxorpart4
	vpxor 0xe0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xe0(%rsi)

	vmovdqa %xmm7,%xmm10
	cmp $0x100,%rax
	jl .Lxorpart4
	vpxor 0xf0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
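	# Same tail handling as in .Lxorpart2 above, with the pending
	# keystream chunk in %xmm10 instead of %xmm7.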
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm10,%xmm10
	vmovdqa %xmm10,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we keep the first four state rows on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, and hence requires no word shuffling. For the
	# final XORing step we transpose the matrix by interleaving 32-, 64-
	# and then 128-bit words, which allows us to do the XOR in AVX
	# registers. 8/16-bit word rotation is done with the slightly better
	# performing byte shuffling; 7/12-bit word rotation uses traditional
	# shift+OR.

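	# Layout during the rounds below: each of ymm4..ymm15 holds one state
	# word (x4..x15) for all eight blocks, one block per 32-bit lane;
	# x0..x3 live in the four 32-byte stack slots at 0x00..0x60(%rsp),
	# with %ymm0 as scratch.  %ymm1 holds CTRINC and %ymm2/%ymm3 hold the
	# ROT8/ROT16 masks.
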
	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea 8(%rsp),%r10
	and $~31, %rsp
	sub $0x80, %rsp
	mov %rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	sub $2,%r8d
	jnz .Ldoubleround8

	# x0..15[0-3] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

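	# At this point lane b of state word n (in a register or its stack
	# slot) holds word n of block b.  The three interleave passes below
	# (32-bit, 64-bit, then 128-bit via vperm2i128) transpose that layout
	# so each ymm register ends up with 32 contiguous bytes of a single
	# block's keystream, which can be xor'ed directly against the input.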
	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa 0x00(%rsp),%ymm1
	vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
	cmp $0x0020,%rax
	jl .Lxorpart8
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	cmp $0x0040,%rax
	jl .Lxorpart8
	vpxor 0x0020(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0020(%rsi)
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

	vmovdqa 0x40(%rsp),%ymm1
	vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
	cmp $0x0060,%rax
	jl .Lxorpart8
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	cmp $0x0080,%rax
	jl .Lxorpart8
	vpxor 0x0060(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0060(%rsi)
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

	vmovdqa 0x20(%rsp),%ymm1
	vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
	cmp $0x00a0,%rax
	jl .Lxorpart8
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	cmp $0x00c0,%rax
	jl .Lxorpart8
	vpxor 0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00a0(%rsi)
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

	vmovdqa 0x60(%rsp),%ymm1
	vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
	cmp $0x00e0,%rax
	jl .Lxorpart8
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	cmp $0x0100,%rax
	jl .Lxorpart8
	vpxor 0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00e0(%rsi)
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa %ymm4,%ymm0
	cmp $0x0120,%rax
	jl .Lxorpart8
	vpxor 0x0100(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0100(%rsi)

	vmovdqa %ymm12,%ymm0
	cmp $0x0140,%rax
	jl .Lxorpart8
	vpxor 0x0120(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0120(%rsi)

	vmovdqa %ymm6,%ymm0
	cmp $0x0160,%rax
	jl .Lxorpart8
	vpxor 0x0140(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0140(%rsi)

	vmovdqa %ymm14,%ymm0
	cmp $0x0180,%rax
	jl .Lxorpart8
	vpxor 0x0160(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0160(%rsi)

	vmovdqa %ymm5,%ymm0
	cmp $0x01a0,%rax
	jl .Lxorpart8
	vpxor 0x0180(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0180(%rsi)

	vmovdqa %ymm13,%ymm0
	cmp $0x01c0,%rax
	jl .Lxorpart8
	vpxor 0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01a0(%rsi)

	vmovdqa %ymm7,%ymm0
	cmp $0x01e0,%rax
	jl .Lxorpart8
	vpxor 0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01c0(%rsi)

	vmovdqa %ymm15,%ymm0
	cmp $0x0200,%rax
	jl .Lxorpart8
	vpxor 0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea -8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
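	# Same tail handling as in .Lxorpart2, but at 32-byte granularity:
	# the pending keystream chunk is in %ymm0 and the function's existing
	# 32-byte aligned stack buffer is reused as the bounce buffer.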
	mov %rax,%r9
	and $0x1f,%r9
	jz .Ldone8
	and $~0x1f,%rax

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)