/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
.text
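
/*
 * Poly1305 MAC for x86_64 (CRYPTOGAMS).  The scalar base 2^64 code is
 * directly below; AVX/AVX2 base 2^26 code follows further down and is
 * selected at run time by poly1305_init from the OPENSSL_ia32cap_P
 * feature bits.
 *
 * Judging from the register usage, the entry points correspond to the
 * SysV AMD64 prototypes used by OpenSSL's poly1305 glue code:
 *
 *   int  poly1305_init  (void *ctx, const u8 key[16], void *func[2]);
 *   void poly1305_blocks(void *ctx, const u8 *inp, size_t len, u32 padbit);
 *   void poly1305_emit  (void *ctx, u8 mac[16], const u32 nonce[4]);
 *
 * Context layout as used by this code: 0..16(%rdi) = accumulator h,
 * 20(%rdi) = "h is in base 2^26" flag, 24(%rdi)/32(%rdi) = clamped key r,
 * and the precomputed powers of r for the vector code from 48(%rdi) onward.
 *
 * The generator emits ".byte 243,15,30,250" for endbr64 and
 * ".byte 0xf3,0xc3" for a (rep) ret.
 */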
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,@function
.align	32
poly1305_init:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

	cmpq	$0,%rsi
	je	.Lno_key

	leaq	poly1305_blocks(%rip),%r10
	leaq	poly1305_emit(%rip),%r11
	movq	OPENSSL_ia32cap_P+4(%rip),%r9
	leaq	poly1305_blocks_avx(%rip),%rax
	leaq	poly1305_emit_avx(%rip),%rcx
	btq	$28,%r9
	cmovcq	%rax,%r10
	cmovcq	%rcx,%r11
	leaq	poly1305_blocks_avx2(%rip),%rax
	btq	$37,%r9
	cmovcq	%rax,%r10
	movq	$0x0ffffffc0fffffff,%rax
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax
	andq	8(%rsi),%rcx
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	%r10,0(%rdx)
	movq	%r11,8(%rdx)
	movl	$1,%eax
.Lno_key:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_init,.-poly1305_init
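
/*
 * poly1305_blocks: absorb len/16 complete 16-byte blocks.  For each block
 * the code adds the block (plus the padbit passed in %rcx as bit 128) to h
 * and computes h = h * r mod 2^130-5 in base 2^64; the carry above 2^130 is
 * folded back in as *5 via the andq $-4 / shrq $2 sequence at the end of
 * .Loop.
 */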

.type	poly1305_blocks,@function
.align	32
poly1305_blocks:
.cfi_startproc
.byte	243,15,30,250
.Lblocks:
	shrq	$4,%rdx
	jz	.Lno_data

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movq	16(%rdi),%rbp

	movq	%r13,%r12
	shrq	$2,%r13
	movq	%r12,%rax
	addq	%r12,%r13
	jmp	.Loop

.align	32
.Loop:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	movq	%r12,%rax
	decq	%r15
	jnz	.Loop

	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks
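
/*
 * poly1305_emit: final reduction and tag output for the base 2^64
 * representation.  h+5 is computed and selected in place of h when it
 * carries out of bit 130 (the shrq $2 / cmovnzq pair), then the 128-bit
 * value from (%rdx) (the encrypted nonce "s") is added and the 16-byte
 * tag is stored at (%rsi).
 */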

.type	poly1305_emit,@function
.align	32
poly1305_emit:
.cfi_startproc
.byte	243,15,30,250
.Lemit:
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
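
/*
 * __poly1305_block: a single h = h * r mod 2^130-5 multiply-and-reduce step
 * (the message block itself is added by the caller beforehand).  It works
 * on the same register assignment as .Loop above, h in %r14/%rbx/%rbp and
 * the key in %r11/%r13 with the second key word in %rax on entry, as set up
 * by the callers below.
 */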
.type	__poly1305_block,@function
.align	32
__poly1305_block:
.cfi_startproc
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block
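
/*
 * The routines below implement the vector path: the key and accumulator are
 * split into five 26-bit limbs, __poly1305_init_avx precomputes the powers
 * of r needed to process several blocks per iteration (stored from
 * 48(%rdi) onward), and poly1305_blocks_avx runs the base 2^26 main loop
 * with lazy carry propagation, converting between radix 2^64 and radix
 * 2^26 as needed.
 */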
238
239
.type __poly1305_init_avx,@function
240
.align 32
241
__poly1305_init_avx:
242
.cfi_startproc
243
movq %r11,%r14
244
movq %r12,%rbx
245
xorq %rbp,%rbp
246
247
leaq 48+64(%rdi),%rdi
248
249
movq %r12,%rax
250
call __poly1305_block
251
252
movl $0x3ffffff,%eax
253
movl $0x3ffffff,%edx
254
movq %r14,%r8
255
andl %r14d,%eax
256
movq %r11,%r9
257
andl %r11d,%edx
258
movl %eax,-64(%rdi)
259
shrq $26,%r8
260
movl %edx,-60(%rdi)
261
shrq $26,%r9
262
263
movl $0x3ffffff,%eax
264
movl $0x3ffffff,%edx
265
andl %r8d,%eax
266
andl %r9d,%edx
267
movl %eax,-48(%rdi)
268
leal (%rax,%rax,4),%eax
269
movl %edx,-44(%rdi)
270
leal (%rdx,%rdx,4),%edx
271
movl %eax,-32(%rdi)
272
shrq $26,%r8
273
movl %edx,-28(%rdi)
274
shrq $26,%r9
275
276
movq %rbx,%rax
277
movq %r12,%rdx
278
shlq $12,%rax
279
shlq $12,%rdx
280
orq %r8,%rax
281
orq %r9,%rdx
282
andl $0x3ffffff,%eax
283
andl $0x3ffffff,%edx
284
movl %eax,-16(%rdi)
285
leal (%rax,%rax,4),%eax
286
movl %edx,-12(%rdi)
287
leal (%rdx,%rdx,4),%edx
288
movl %eax,0(%rdi)
289
movq %rbx,%r8
290
movl %edx,4(%rdi)
291
movq %r12,%r9
292
293
movl $0x3ffffff,%eax
294
movl $0x3ffffff,%edx
295
shrq $14,%r8
296
shrq $14,%r9
297
andl %r8d,%eax
298
andl %r9d,%edx
299
movl %eax,16(%rdi)
300
leal (%rax,%rax,4),%eax
301
movl %edx,20(%rdi)
302
leal (%rdx,%rdx,4),%edx
303
movl %eax,32(%rdi)
304
shrq $26,%r8
305
movl %edx,36(%rdi)
306
shrq $26,%r9
307
308
movq %rbp,%rax
309
shlq $24,%rax
310
orq %rax,%r8
311
movl %r8d,48(%rdi)
312
leaq (%r8,%r8,4),%r8
313
movl %r9d,52(%rdi)
314
leaq (%r9,%r9,4),%r9
315
movl %r8d,64(%rdi)
316
movl %r9d,68(%rdi)
317
318
movq %r12,%rax
319
call __poly1305_block
320
321
movl $0x3ffffff,%eax
322
movq %r14,%r8
323
andl %r14d,%eax
324
shrq $26,%r8
325
movl %eax,-52(%rdi)
326
327
movl $0x3ffffff,%edx
328
andl %r8d,%edx
329
movl %edx,-36(%rdi)
330
leal (%rdx,%rdx,4),%edx
331
shrq $26,%r8
332
movl %edx,-20(%rdi)
333
334
movq %rbx,%rax
335
shlq $12,%rax
336
orq %r8,%rax
337
andl $0x3ffffff,%eax
338
movl %eax,-4(%rdi)
339
leal (%rax,%rax,4),%eax
340
movq %rbx,%r8
341
movl %eax,12(%rdi)
342
343
movl $0x3ffffff,%edx
344
shrq $14,%r8
345
andl %r8d,%edx
346
movl %edx,28(%rdi)
347
leal (%rdx,%rdx,4),%edx
348
shrq $26,%r8
349
movl %edx,44(%rdi)
350
351
movq %rbp,%rax
352
shlq $24,%rax
353
orq %rax,%r8
354
movl %r8d,60(%rdi)
355
leaq (%r8,%r8,4),%r8
356
movl %r8d,76(%rdi)
357
358
movq %r12,%rax
359
call __poly1305_block
360
361
movl $0x3ffffff,%eax
362
movq %r14,%r8
363
andl %r14d,%eax
364
shrq $26,%r8
365
movl %eax,-56(%rdi)
366
367
movl $0x3ffffff,%edx
368
andl %r8d,%edx
369
movl %edx,-40(%rdi)
370
leal (%rdx,%rdx,4),%edx
371
shrq $26,%r8
372
movl %edx,-24(%rdi)
373
374
movq %rbx,%rax
375
shlq $12,%rax
376
orq %r8,%rax
377
andl $0x3ffffff,%eax
378
movl %eax,-8(%rdi)
379
leal (%rax,%rax,4),%eax
380
movq %rbx,%r8
381
movl %eax,8(%rdi)
382
383
movl $0x3ffffff,%edx
384
shrq $14,%r8
385
andl %r8d,%edx
386
movl %edx,24(%rdi)
387
leal (%rdx,%rdx,4),%edx
388
shrq $26,%r8
389
movl %edx,40(%rdi)
390
391
movq %rbp,%rax
392
shlq $24,%rax
393
orq %rax,%r8
394
movl %r8d,56(%rdi)
395
leaq (%r8,%r8,4),%r8
396
movl %r8d,72(%rdi)
397
398
leaq -48-64(%rdi),%rdi
399
.byte 0xf3,0xc3
400
.cfi_endproc
401
.size __poly1305_init_avx,.-__poly1305_init_avx
402
403
.type poly1305_blocks_avx,@function
404
.align 32
405
poly1305_blocks_avx:
406
.cfi_startproc
407
.byte 243,15,30,250
408
movl 20(%rdi),%r8d
409
cmpq $128,%rdx
410
jae .Lblocks_avx
411
testl %r8d,%r8d
412
jz .Lblocks
413
414
.Lblocks_avx:
415
andq $-16,%rdx
416
jz .Lno_data_avx
417
418
vzeroupper
419
420
testl %r8d,%r8d
421
jz .Lbase2_64_avx
422
423
testq $31,%rdx
424
jz .Leven_avx
425
426
pushq %rbx
427
.cfi_adjust_cfa_offset 8
428
.cfi_offset %rbx,-16
429
pushq %rbp
430
.cfi_adjust_cfa_offset 8
431
.cfi_offset %rbp,-24
432
pushq %r12
433
.cfi_adjust_cfa_offset 8
434
.cfi_offset %r12,-32
435
pushq %r13
436
.cfi_adjust_cfa_offset 8
437
.cfi_offset %r13,-40
438
pushq %r14
439
.cfi_adjust_cfa_offset 8
440
.cfi_offset %r14,-48
441
pushq %r15
442
.cfi_adjust_cfa_offset 8
443
.cfi_offset %r15,-56
444
.Lblocks_avx_body:
445
446
movq %rdx,%r15
447
448
movq 0(%rdi),%r8
449
movq 8(%rdi),%r9
450
movl 16(%rdi),%ebp
451
452
movq 24(%rdi),%r11
453
movq 32(%rdi),%r13
454
455
456
movl %r8d,%r14d
457
andq $-2147483648,%r8
458
movq %r9,%r12
459
movl %r9d,%ebx
460
andq $-2147483648,%r9
461
462
shrq $6,%r8
463
shlq $52,%r12
464
addq %r8,%r14
465
shrq $12,%rbx
466
shrq $18,%r9
467
addq %r12,%r14
468
adcq %r9,%rbx
469
470
movq %rbp,%r8
471
shlq $40,%r8
472
shrq $24,%rbp
473
addq %r8,%rbx
474
adcq $0,%rbp
475
476
movq $-4,%r9
477
movq %rbp,%r8
478
andq %rbp,%r9
479
shrq $2,%r8
480
andq $3,%rbp
481
addq %r9,%r8
482
addq %r8,%r14
483
adcq $0,%rbx
484
adcq $0,%rbp
485
486
movq %r13,%r12
487
movq %r13,%rax
488
shrq $2,%r13
489
addq %r12,%r13
490
491
addq 0(%rsi),%r14
492
adcq 8(%rsi),%rbx
493
leaq 16(%rsi),%rsi
494
adcq %rcx,%rbp
495
496
call __poly1305_block
497
498
testq %rcx,%rcx
499
jz .Lstore_base2_64_avx
500
501
502
movq %r14,%rax
503
movq %r14,%rdx
504
shrq $52,%r14
505
movq %rbx,%r11
506
movq %rbx,%r12
507
shrq $26,%rdx
508
andq $0x3ffffff,%rax
509
shlq $12,%r11
510
andq $0x3ffffff,%rdx
511
shrq $14,%rbx
512
orq %r11,%r14
513
shlq $24,%rbp
514
andq $0x3ffffff,%r14
515
shrq $40,%r12
516
andq $0x3ffffff,%rbx
517
orq %r12,%rbp
518
519
subq $16,%r15
520
jz .Lstore_base2_26_avx
521
522
vmovd %eax,%xmm0
523
vmovd %edx,%xmm1
524
vmovd %r14d,%xmm2
525
vmovd %ebx,%xmm3
526
vmovd %ebp,%xmm4
527
jmp .Lproceed_avx
528
529
.align 32
530
.Lstore_base2_64_avx:
531
movq %r14,0(%rdi)
532
movq %rbx,8(%rdi)
533
movq %rbp,16(%rdi)
534
jmp .Ldone_avx
535
536
.align 16
537
.Lstore_base2_26_avx:
538
movl %eax,0(%rdi)
539
movl %edx,4(%rdi)
540
movl %r14d,8(%rdi)
541
movl %ebx,12(%rdi)
542
movl %ebp,16(%rdi)
543
.align 16
544
.Ldone_avx:
545
movq 0(%rsp),%r15
546
.cfi_restore %r15
547
movq 8(%rsp),%r14
548
.cfi_restore %r14
549
movq 16(%rsp),%r13
550
.cfi_restore %r13
551
movq 24(%rsp),%r12
552
.cfi_restore %r12
553
movq 32(%rsp),%rbp
554
.cfi_restore %rbp
555
movq 40(%rsp),%rbx
556
.cfi_restore %rbx
557
leaq 48(%rsp),%rsp
558
.cfi_adjust_cfa_offset -48
559
.Lno_data_avx:
560
.Lblocks_avx_epilogue:
561
.byte 0xf3,0xc3
562
.cfi_endproc
563
564
.align 32
565
.Lbase2_64_avx:
566
.cfi_startproc
567
pushq %rbx
568
.cfi_adjust_cfa_offset 8
569
.cfi_offset %rbx,-16
570
pushq %rbp
571
.cfi_adjust_cfa_offset 8
572
.cfi_offset %rbp,-24
573
pushq %r12
574
.cfi_adjust_cfa_offset 8
575
.cfi_offset %r12,-32
576
pushq %r13
577
.cfi_adjust_cfa_offset 8
578
.cfi_offset %r13,-40
579
pushq %r14
580
.cfi_adjust_cfa_offset 8
581
.cfi_offset %r14,-48
582
pushq %r15
583
.cfi_adjust_cfa_offset 8
584
.cfi_offset %r15,-56
585
.Lbase2_64_avx_body:
586
587
movq %rdx,%r15
588
589
movq 24(%rdi),%r11
590
movq 32(%rdi),%r13
591
592
movq 0(%rdi),%r14
593
movq 8(%rdi),%rbx
594
movl 16(%rdi),%ebp
595
596
movq %r13,%r12
597
movq %r13,%rax
598
shrq $2,%r13
599
addq %r12,%r13
600
601
testq $31,%rdx
602
jz .Linit_avx
603
604
addq 0(%rsi),%r14
605
adcq 8(%rsi),%rbx
606
leaq 16(%rsi),%rsi
607
adcq %rcx,%rbp
608
subq $16,%r15
609
610
call __poly1305_block
611
612
.Linit_avx:
613
614
movq %r14,%rax
615
movq %r14,%rdx
616
shrq $52,%r14
617
movq %rbx,%r8
618
movq %rbx,%r9
619
shrq $26,%rdx
620
andq $0x3ffffff,%rax
621
shlq $12,%r8
622
andq $0x3ffffff,%rdx
623
shrq $14,%rbx
624
orq %r8,%r14
625
shlq $24,%rbp
626
andq $0x3ffffff,%r14
627
shrq $40,%r9
628
andq $0x3ffffff,%rbx
629
orq %r9,%rbp
630
631
vmovd %eax,%xmm0
632
vmovd %edx,%xmm1
633
vmovd %r14d,%xmm2
634
vmovd %ebx,%xmm3
635
vmovd %ebp,%xmm4
636
movl $1,20(%rdi)
637
638
call __poly1305_init_avx
639
640
.Lproceed_avx:
641
movq %r15,%rdx
642
643
movq 0(%rsp),%r15
644
.cfi_restore %r15
645
movq 8(%rsp),%r14
646
.cfi_restore %r14
647
movq 16(%rsp),%r13
648
.cfi_restore %r13
649
movq 24(%rsp),%r12
650
.cfi_restore %r12
651
movq 32(%rsp),%rbp
652
.cfi_restore %rbp
653
movq 40(%rsp),%rbx
654
.cfi_restore %rbx
655
leaq 48(%rsp),%rax
656
leaq 48(%rsp),%rsp
657
.cfi_adjust_cfa_offset -48
658
.Lbase2_64_avx_epilogue:
659
jmp .Ldo_avx
660
.cfi_endproc
661
662
.align 32
663
.Leven_avx:
664
.cfi_startproc
665
vmovd 0(%rdi),%xmm0
666
vmovd 4(%rdi),%xmm1
667
vmovd 8(%rdi),%xmm2
668
vmovd 12(%rdi),%xmm3
669
vmovd 16(%rdi),%xmm4
670
671
.Ldo_avx:
672
leaq -88(%rsp),%r11
673
.cfi_def_cfa %r11,0x60
674
subq $0x178,%rsp
675
subq $64,%rdx
676
leaq -32(%rsi),%rax
677
cmovcq %rax,%rsi
678
679
vmovdqu 48(%rdi),%xmm14
680
leaq 112(%rdi),%rdi
681
leaq .Lconst(%rip),%rcx
682
683
684
685
vmovdqu 32(%rsi),%xmm5
686
vmovdqu 48(%rsi),%xmm6
687
vmovdqa 64(%rcx),%xmm15
688
689
vpsrldq $6,%xmm5,%xmm7
690
vpsrldq $6,%xmm6,%xmm8
691
vpunpckhqdq %xmm6,%xmm5,%xmm9
692
vpunpcklqdq %xmm6,%xmm5,%xmm5
693
vpunpcklqdq %xmm8,%xmm7,%xmm8
694
695
vpsrlq $40,%xmm9,%xmm9
696
vpsrlq $26,%xmm5,%xmm6
697
vpand %xmm15,%xmm5,%xmm5
698
vpsrlq $4,%xmm8,%xmm7
699
vpand %xmm15,%xmm6,%xmm6
700
vpsrlq $30,%xmm8,%xmm8
701
vpand %xmm15,%xmm7,%xmm7
702
vpand %xmm15,%xmm8,%xmm8
703
vpor 32(%rcx),%xmm9,%xmm9
704
705
jbe .Lskip_loop_avx
706
707
708
vmovdqu -48(%rdi),%xmm11
709
vmovdqu -32(%rdi),%xmm12
710
vpshufd $0xEE,%xmm14,%xmm13
711
vpshufd $0x44,%xmm14,%xmm10
712
vmovdqa %xmm13,-144(%r11)
713
vmovdqa %xmm10,0(%rsp)
714
vpshufd $0xEE,%xmm11,%xmm14
715
vmovdqu -16(%rdi),%xmm10
716
vpshufd $0x44,%xmm11,%xmm11
717
vmovdqa %xmm14,-128(%r11)
718
vmovdqa %xmm11,16(%rsp)
719
vpshufd $0xEE,%xmm12,%xmm13
720
vmovdqu 0(%rdi),%xmm11
721
vpshufd $0x44,%xmm12,%xmm12
722
vmovdqa %xmm13,-112(%r11)
723
vmovdqa %xmm12,32(%rsp)
724
vpshufd $0xEE,%xmm10,%xmm14
725
vmovdqu 16(%rdi),%xmm12
726
vpshufd $0x44,%xmm10,%xmm10
727
vmovdqa %xmm14,-96(%r11)
728
vmovdqa %xmm10,48(%rsp)
729
vpshufd $0xEE,%xmm11,%xmm13
730
vmovdqu 32(%rdi),%xmm10
731
vpshufd $0x44,%xmm11,%xmm11
732
vmovdqa %xmm13,-80(%r11)
733
vmovdqa %xmm11,64(%rsp)
734
vpshufd $0xEE,%xmm12,%xmm14
735
vmovdqu 48(%rdi),%xmm11
736
vpshufd $0x44,%xmm12,%xmm12
737
vmovdqa %xmm14,-64(%r11)
738
vmovdqa %xmm12,80(%rsp)
739
vpshufd $0xEE,%xmm10,%xmm13
740
vmovdqu 64(%rdi),%xmm12
741
vpshufd $0x44,%xmm10,%xmm10
742
vmovdqa %xmm13,-48(%r11)
743
vmovdqa %xmm10,96(%rsp)
744
vpshufd $0xEE,%xmm11,%xmm14
745
vpshufd $0x44,%xmm11,%xmm11
746
vmovdqa %xmm14,-32(%r11)
747
vmovdqa %xmm11,112(%rsp)
748
vpshufd $0xEE,%xmm12,%xmm13
749
vmovdqa 0(%rsp),%xmm14
750
vpshufd $0x44,%xmm12,%xmm12
751
vmovdqa %xmm13,-16(%r11)
752
vmovdqa %xmm12,128(%rsp)
753
754
jmp .Loop_avx
755
756
.align 32
757
.Loop_avx:
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
vpmuludq %xmm5,%xmm14,%xmm10
779
vpmuludq %xmm6,%xmm14,%xmm11
780
vmovdqa %xmm2,32(%r11)
781
vpmuludq %xmm7,%xmm14,%xmm12
782
vmovdqa 16(%rsp),%xmm2
783
vpmuludq %xmm8,%xmm14,%xmm13
784
vpmuludq %xmm9,%xmm14,%xmm14
785
786
vmovdqa %xmm0,0(%r11)
787
vpmuludq 32(%rsp),%xmm9,%xmm0
788
vmovdqa %xmm1,16(%r11)
789
vpmuludq %xmm8,%xmm2,%xmm1
790
vpaddq %xmm0,%xmm10,%xmm10
791
vpaddq %xmm1,%xmm14,%xmm14
792
vmovdqa %xmm3,48(%r11)
793
vpmuludq %xmm7,%xmm2,%xmm0
794
vpmuludq %xmm6,%xmm2,%xmm1
795
vpaddq %xmm0,%xmm13,%xmm13
796
vmovdqa 48(%rsp),%xmm3
797
vpaddq %xmm1,%xmm12,%xmm12
798
vmovdqa %xmm4,64(%r11)
799
vpmuludq %xmm5,%xmm2,%xmm2
800
vpmuludq %xmm7,%xmm3,%xmm0
801
vpaddq %xmm2,%xmm11,%xmm11
802
803
vmovdqa 64(%rsp),%xmm4
804
vpaddq %xmm0,%xmm14,%xmm14
805
vpmuludq %xmm6,%xmm3,%xmm1
806
vpmuludq %xmm5,%xmm3,%xmm3
807
vpaddq %xmm1,%xmm13,%xmm13
808
vmovdqa 80(%rsp),%xmm2
809
vpaddq %xmm3,%xmm12,%xmm12
810
vpmuludq %xmm9,%xmm4,%xmm0
811
vpmuludq %xmm8,%xmm4,%xmm4
812
vpaddq %xmm0,%xmm11,%xmm11
813
vmovdqa 96(%rsp),%xmm3
814
vpaddq %xmm4,%xmm10,%xmm10
815
816
vmovdqa 128(%rsp),%xmm4
817
vpmuludq %xmm6,%xmm2,%xmm1
818
vpmuludq %xmm5,%xmm2,%xmm2
819
vpaddq %xmm1,%xmm14,%xmm14
820
vpaddq %xmm2,%xmm13,%xmm13
821
vpmuludq %xmm9,%xmm3,%xmm0
822
vpmuludq %xmm8,%xmm3,%xmm1
823
vpaddq %xmm0,%xmm12,%xmm12
824
vmovdqu 0(%rsi),%xmm0
825
vpaddq %xmm1,%xmm11,%xmm11
826
vpmuludq %xmm7,%xmm3,%xmm3
827
vpmuludq %xmm7,%xmm4,%xmm7
828
vpaddq %xmm3,%xmm10,%xmm10
829
830
vmovdqu 16(%rsi),%xmm1
831
vpaddq %xmm7,%xmm11,%xmm11
832
vpmuludq %xmm8,%xmm4,%xmm8
833
vpmuludq %xmm9,%xmm4,%xmm9
834
vpsrldq $6,%xmm0,%xmm2
835
vpaddq %xmm8,%xmm12,%xmm12
836
vpaddq %xmm9,%xmm13,%xmm13
837
vpsrldq $6,%xmm1,%xmm3
838
vpmuludq 112(%rsp),%xmm5,%xmm9
839
vpmuludq %xmm6,%xmm4,%xmm5
840
vpunpckhqdq %xmm1,%xmm0,%xmm4
841
vpaddq %xmm9,%xmm14,%xmm14
842
vmovdqa -144(%r11),%xmm9
843
vpaddq %xmm5,%xmm10,%xmm10
844
845
vpunpcklqdq %xmm1,%xmm0,%xmm0
846
vpunpcklqdq %xmm3,%xmm2,%xmm3
847
848
849
vpsrldq $5,%xmm4,%xmm4
850
vpsrlq $26,%xmm0,%xmm1
851
vpand %xmm15,%xmm0,%xmm0
852
vpsrlq $4,%xmm3,%xmm2
853
vpand %xmm15,%xmm1,%xmm1
854
vpand 0(%rcx),%xmm4,%xmm4
855
vpsrlq $30,%xmm3,%xmm3
856
vpand %xmm15,%xmm2,%xmm2
857
vpand %xmm15,%xmm3,%xmm3
858
vpor 32(%rcx),%xmm4,%xmm4
859
860
vpaddq 0(%r11),%xmm0,%xmm0
861
vpaddq 16(%r11),%xmm1,%xmm1
862
vpaddq 32(%r11),%xmm2,%xmm2
863
vpaddq 48(%r11),%xmm3,%xmm3
864
vpaddq 64(%r11),%xmm4,%xmm4
865
866
leaq 32(%rsi),%rax
867
leaq 64(%rsi),%rsi
868
subq $64,%rdx
869
cmovcq %rax,%rsi
870
871
872
873
874
875
876
877
878
879
880
vpmuludq %xmm0,%xmm9,%xmm5
881
vpmuludq %xmm1,%xmm9,%xmm6
882
vpaddq %xmm5,%xmm10,%xmm10
883
vpaddq %xmm6,%xmm11,%xmm11
884
vmovdqa -128(%r11),%xmm7
885
vpmuludq %xmm2,%xmm9,%xmm5
886
vpmuludq %xmm3,%xmm9,%xmm6
887
vpaddq %xmm5,%xmm12,%xmm12
888
vpaddq %xmm6,%xmm13,%xmm13
889
vpmuludq %xmm4,%xmm9,%xmm9
890
vpmuludq -112(%r11),%xmm4,%xmm5
891
vpaddq %xmm9,%xmm14,%xmm14
892
893
vpaddq %xmm5,%xmm10,%xmm10
894
vpmuludq %xmm2,%xmm7,%xmm6
895
vpmuludq %xmm3,%xmm7,%xmm5
896
vpaddq %xmm6,%xmm13,%xmm13
897
vmovdqa -96(%r11),%xmm8
898
vpaddq %xmm5,%xmm14,%xmm14
899
vpmuludq %xmm1,%xmm7,%xmm6
900
vpmuludq %xmm0,%xmm7,%xmm7
901
vpaddq %xmm6,%xmm12,%xmm12
902
vpaddq %xmm7,%xmm11,%xmm11
903
904
vmovdqa -80(%r11),%xmm9
905
vpmuludq %xmm2,%xmm8,%xmm5
906
vpmuludq %xmm1,%xmm8,%xmm6
907
vpaddq %xmm5,%xmm14,%xmm14
908
vpaddq %xmm6,%xmm13,%xmm13
909
vmovdqa -64(%r11),%xmm7
910
vpmuludq %xmm0,%xmm8,%xmm8
911
vpmuludq %xmm4,%xmm9,%xmm5
912
vpaddq %xmm8,%xmm12,%xmm12
913
vpaddq %xmm5,%xmm11,%xmm11
914
vmovdqa -48(%r11),%xmm8
915
vpmuludq %xmm3,%xmm9,%xmm9
916
vpmuludq %xmm1,%xmm7,%xmm6
917
vpaddq %xmm9,%xmm10,%xmm10
918
919
vmovdqa -16(%r11),%xmm9
920
vpaddq %xmm6,%xmm14,%xmm14
921
vpmuludq %xmm0,%xmm7,%xmm7
922
vpmuludq %xmm4,%xmm8,%xmm5
923
vpaddq %xmm7,%xmm13,%xmm13
924
vpaddq %xmm5,%xmm12,%xmm12
925
vmovdqu 32(%rsi),%xmm5
926
vpmuludq %xmm3,%xmm8,%xmm7
927
vpmuludq %xmm2,%xmm8,%xmm8
928
vpaddq %xmm7,%xmm11,%xmm11
929
vmovdqu 48(%rsi),%xmm6
930
vpaddq %xmm8,%xmm10,%xmm10
931
932
vpmuludq %xmm2,%xmm9,%xmm2
933
vpmuludq %xmm3,%xmm9,%xmm3
934
vpsrldq $6,%xmm5,%xmm7
935
vpaddq %xmm2,%xmm11,%xmm11
936
vpmuludq %xmm4,%xmm9,%xmm4
937
vpsrldq $6,%xmm6,%xmm8
938
vpaddq %xmm3,%xmm12,%xmm2
939
vpaddq %xmm4,%xmm13,%xmm3
940
vpmuludq -32(%r11),%xmm0,%xmm4
941
vpmuludq %xmm1,%xmm9,%xmm0
942
vpunpckhqdq %xmm6,%xmm5,%xmm9
943
vpaddq %xmm4,%xmm14,%xmm4
944
vpaddq %xmm0,%xmm10,%xmm0
945
946
vpunpcklqdq %xmm6,%xmm5,%xmm5
947
vpunpcklqdq %xmm8,%xmm7,%xmm8
948
949
950
vpsrldq $5,%xmm9,%xmm9
951
vpsrlq $26,%xmm5,%xmm6
952
vmovdqa 0(%rsp),%xmm14
953
vpand %xmm15,%xmm5,%xmm5
954
vpsrlq $4,%xmm8,%xmm7
955
vpand %xmm15,%xmm6,%xmm6
956
vpand 0(%rcx),%xmm9,%xmm9
957
vpsrlq $30,%xmm8,%xmm8
958
vpand %xmm15,%xmm7,%xmm7
959
vpand %xmm15,%xmm8,%xmm8
960
vpor 32(%rcx),%xmm9,%xmm9
961
962
963
964
965
966
vpsrlq $26,%xmm3,%xmm13
967
vpand %xmm15,%xmm3,%xmm3
968
vpaddq %xmm13,%xmm4,%xmm4
969
970
vpsrlq $26,%xmm0,%xmm10
971
vpand %xmm15,%xmm0,%xmm0
972
vpaddq %xmm10,%xmm11,%xmm1
973
974
vpsrlq $26,%xmm4,%xmm10
975
vpand %xmm15,%xmm4,%xmm4
976
977
vpsrlq $26,%xmm1,%xmm11
978
vpand %xmm15,%xmm1,%xmm1
979
vpaddq %xmm11,%xmm2,%xmm2
980
981
vpaddq %xmm10,%xmm0,%xmm0
982
vpsllq $2,%xmm10,%xmm10
983
vpaddq %xmm10,%xmm0,%xmm0
984
985
vpsrlq $26,%xmm2,%xmm12
986
vpand %xmm15,%xmm2,%xmm2
987
vpaddq %xmm12,%xmm3,%xmm3
988
989
vpsrlq $26,%xmm0,%xmm10
990
vpand %xmm15,%xmm0,%xmm0
991
vpaddq %xmm10,%xmm1,%xmm1
992
993
vpsrlq $26,%xmm3,%xmm13
994
vpand %xmm15,%xmm3,%xmm3
995
vpaddq %xmm13,%xmm4,%xmm4
996
997
ja .Loop_avx
998
999
.Lskip_loop_avx:
1000
1001
1002
1003
vpshufd $0x10,%xmm14,%xmm14
1004
addq $32,%rdx
1005
jnz .Long_tail_avx
1006
1007
vpaddq %xmm2,%xmm7,%xmm7
1008
vpaddq %xmm0,%xmm5,%xmm5
1009
vpaddq %xmm1,%xmm6,%xmm6
1010
vpaddq %xmm3,%xmm8,%xmm8
1011
vpaddq %xmm4,%xmm9,%xmm9
1012
1013
.Long_tail_avx:
1014
vmovdqa %xmm2,32(%r11)
1015
vmovdqa %xmm0,0(%r11)
1016
vmovdqa %xmm1,16(%r11)
1017
vmovdqa %xmm3,48(%r11)
1018
vmovdqa %xmm4,64(%r11)
1019
1020
1021
1022
1023
1024
1025
1026
vpmuludq %xmm7,%xmm14,%xmm12
1027
vpmuludq %xmm5,%xmm14,%xmm10
1028
vpshufd $0x10,-48(%rdi),%xmm2
1029
vpmuludq %xmm6,%xmm14,%xmm11
1030
vpmuludq %xmm8,%xmm14,%xmm13
1031
vpmuludq %xmm9,%xmm14,%xmm14
1032
1033
vpmuludq %xmm8,%xmm2,%xmm0
1034
vpaddq %xmm0,%xmm14,%xmm14
1035
vpshufd $0x10,-32(%rdi),%xmm3
1036
vpmuludq %xmm7,%xmm2,%xmm1
1037
vpaddq %xmm1,%xmm13,%xmm13
1038
vpshufd $0x10,-16(%rdi),%xmm4
1039
vpmuludq %xmm6,%xmm2,%xmm0
1040
vpaddq %xmm0,%xmm12,%xmm12
1041
vpmuludq %xmm5,%xmm2,%xmm2
1042
vpaddq %xmm2,%xmm11,%xmm11
1043
vpmuludq %xmm9,%xmm3,%xmm3
1044
vpaddq %xmm3,%xmm10,%xmm10
1045
1046
vpshufd $0x10,0(%rdi),%xmm2
1047
vpmuludq %xmm7,%xmm4,%xmm1
1048
vpaddq %xmm1,%xmm14,%xmm14
1049
vpmuludq %xmm6,%xmm4,%xmm0
1050
vpaddq %xmm0,%xmm13,%xmm13
1051
vpshufd $0x10,16(%rdi),%xmm3
1052
vpmuludq %xmm5,%xmm4,%xmm4
1053
vpaddq %xmm4,%xmm12,%xmm12
1054
vpmuludq %xmm9,%xmm2,%xmm1
1055
vpaddq %xmm1,%xmm11,%xmm11
1056
vpshufd $0x10,32(%rdi),%xmm4
1057
vpmuludq %xmm8,%xmm2,%xmm2
1058
vpaddq %xmm2,%xmm10,%xmm10
1059
1060
vpmuludq %xmm6,%xmm3,%xmm0
1061
vpaddq %xmm0,%xmm14,%xmm14
1062
vpmuludq %xmm5,%xmm3,%xmm3
1063
vpaddq %xmm3,%xmm13,%xmm13
1064
vpshufd $0x10,48(%rdi),%xmm2
1065
vpmuludq %xmm9,%xmm4,%xmm1
1066
vpaddq %xmm1,%xmm12,%xmm12
1067
vpshufd $0x10,64(%rdi),%xmm3
1068
vpmuludq %xmm8,%xmm4,%xmm0
1069
vpaddq %xmm0,%xmm11,%xmm11
1070
vpmuludq %xmm7,%xmm4,%xmm4
1071
vpaddq %xmm4,%xmm10,%xmm10
1072
1073
vpmuludq %xmm5,%xmm2,%xmm2
1074
vpaddq %xmm2,%xmm14,%xmm14
1075
vpmuludq %xmm9,%xmm3,%xmm1
1076
vpaddq %xmm1,%xmm13,%xmm13
1077
vpmuludq %xmm8,%xmm3,%xmm0
1078
vpaddq %xmm0,%xmm12,%xmm12
1079
vpmuludq %xmm7,%xmm3,%xmm1
1080
vpaddq %xmm1,%xmm11,%xmm11
1081
vpmuludq %xmm6,%xmm3,%xmm3
1082
vpaddq %xmm3,%xmm10,%xmm10
1083
1084
jz .Lshort_tail_avx
1085
1086
vmovdqu 0(%rsi),%xmm0
1087
vmovdqu 16(%rsi),%xmm1
1088
1089
vpsrldq $6,%xmm0,%xmm2
1090
vpsrldq $6,%xmm1,%xmm3
1091
vpunpckhqdq %xmm1,%xmm0,%xmm4
1092
vpunpcklqdq %xmm1,%xmm0,%xmm0
1093
vpunpcklqdq %xmm3,%xmm2,%xmm3
1094
1095
vpsrlq $40,%xmm4,%xmm4
1096
vpsrlq $26,%xmm0,%xmm1
1097
vpand %xmm15,%xmm0,%xmm0
1098
vpsrlq $4,%xmm3,%xmm2
1099
vpand %xmm15,%xmm1,%xmm1
1100
vpsrlq $30,%xmm3,%xmm3
1101
vpand %xmm15,%xmm2,%xmm2
1102
vpand %xmm15,%xmm3,%xmm3
1103
vpor 32(%rcx),%xmm4,%xmm4
1104
1105
vpshufd $0x32,-64(%rdi),%xmm9
1106
vpaddq 0(%r11),%xmm0,%xmm0
1107
vpaddq 16(%r11),%xmm1,%xmm1
1108
vpaddq 32(%r11),%xmm2,%xmm2
1109
vpaddq 48(%r11),%xmm3,%xmm3
1110
vpaddq 64(%r11),%xmm4,%xmm4
1111
1112
1113
1114
1115
vpmuludq %xmm0,%xmm9,%xmm5
1116
vpaddq %xmm5,%xmm10,%xmm10
1117
vpmuludq %xmm1,%xmm9,%xmm6
1118
vpaddq %xmm6,%xmm11,%xmm11
1119
vpmuludq %xmm2,%xmm9,%xmm5
1120
vpaddq %xmm5,%xmm12,%xmm12
1121
vpshufd $0x32,-48(%rdi),%xmm7
1122
vpmuludq %xmm3,%xmm9,%xmm6
1123
vpaddq %xmm6,%xmm13,%xmm13
1124
vpmuludq %xmm4,%xmm9,%xmm9
1125
vpaddq %xmm9,%xmm14,%xmm14
1126
1127
vpmuludq %xmm3,%xmm7,%xmm5
1128
vpaddq %xmm5,%xmm14,%xmm14
1129
vpshufd $0x32,-32(%rdi),%xmm8
1130
vpmuludq %xmm2,%xmm7,%xmm6
1131
vpaddq %xmm6,%xmm13,%xmm13
1132
vpshufd $0x32,-16(%rdi),%xmm9
1133
vpmuludq %xmm1,%xmm7,%xmm5
1134
vpaddq %xmm5,%xmm12,%xmm12
1135
vpmuludq %xmm0,%xmm7,%xmm7
1136
vpaddq %xmm7,%xmm11,%xmm11
1137
vpmuludq %xmm4,%xmm8,%xmm8
1138
vpaddq %xmm8,%xmm10,%xmm10
1139
1140
vpshufd $0x32,0(%rdi),%xmm7
1141
vpmuludq %xmm2,%xmm9,%xmm6
1142
vpaddq %xmm6,%xmm14,%xmm14
1143
vpmuludq %xmm1,%xmm9,%xmm5
1144
vpaddq %xmm5,%xmm13,%xmm13
1145
vpshufd $0x32,16(%rdi),%xmm8
1146
vpmuludq %xmm0,%xmm9,%xmm9
1147
vpaddq %xmm9,%xmm12,%xmm12
1148
vpmuludq %xmm4,%xmm7,%xmm6
1149
vpaddq %xmm6,%xmm11,%xmm11
1150
vpshufd $0x32,32(%rdi),%xmm9
1151
vpmuludq %xmm3,%xmm7,%xmm7
1152
vpaddq %xmm7,%xmm10,%xmm10
1153
1154
vpmuludq %xmm1,%xmm8,%xmm5
1155
vpaddq %xmm5,%xmm14,%xmm14
1156
vpmuludq %xmm0,%xmm8,%xmm8
1157
vpaddq %xmm8,%xmm13,%xmm13
1158
vpshufd $0x32,48(%rdi),%xmm7
1159
vpmuludq %xmm4,%xmm9,%xmm6
1160
vpaddq %xmm6,%xmm12,%xmm12
1161
vpshufd $0x32,64(%rdi),%xmm8
1162
vpmuludq %xmm3,%xmm9,%xmm5
1163
vpaddq %xmm5,%xmm11,%xmm11
1164
vpmuludq %xmm2,%xmm9,%xmm9
1165
vpaddq %xmm9,%xmm10,%xmm10
1166
1167
vpmuludq %xmm0,%xmm7,%xmm7
1168
vpaddq %xmm7,%xmm14,%xmm14
1169
vpmuludq %xmm4,%xmm8,%xmm6
1170
vpaddq %xmm6,%xmm13,%xmm13
1171
vpmuludq %xmm3,%xmm8,%xmm5
1172
vpaddq %xmm5,%xmm12,%xmm12
1173
vpmuludq %xmm2,%xmm8,%xmm6
1174
vpaddq %xmm6,%xmm11,%xmm11
1175
vpmuludq %xmm1,%xmm8,%xmm8
1176
vpaddq %xmm8,%xmm10,%xmm10
1177
1178
.Lshort_tail_avx:
1179
1180
1181
1182
vpsrldq $8,%xmm14,%xmm9
1183
vpsrldq $8,%xmm13,%xmm8
1184
vpsrldq $8,%xmm11,%xmm6
1185
vpsrldq $8,%xmm10,%xmm5
1186
vpsrldq $8,%xmm12,%xmm7
1187
vpaddq %xmm8,%xmm13,%xmm13
1188
vpaddq %xmm9,%xmm14,%xmm14
1189
vpaddq %xmm5,%xmm10,%xmm10
1190
vpaddq %xmm6,%xmm11,%xmm11
1191
vpaddq %xmm7,%xmm12,%xmm12
1192
1193
1194
1195
1196
vpsrlq $26,%xmm13,%xmm3
1197
vpand %xmm15,%xmm13,%xmm13
1198
vpaddq %xmm3,%xmm14,%xmm14
1199
1200
vpsrlq $26,%xmm10,%xmm0
1201
vpand %xmm15,%xmm10,%xmm10
1202
vpaddq %xmm0,%xmm11,%xmm11
1203
1204
vpsrlq $26,%xmm14,%xmm4
1205
vpand %xmm15,%xmm14,%xmm14
1206
1207
vpsrlq $26,%xmm11,%xmm1
1208
vpand %xmm15,%xmm11,%xmm11
1209
vpaddq %xmm1,%xmm12,%xmm12
1210
1211
vpaddq %xmm4,%xmm10,%xmm10
1212
vpsllq $2,%xmm4,%xmm4
1213
vpaddq %xmm4,%xmm10,%xmm10
1214
1215
vpsrlq $26,%xmm12,%xmm2
1216
vpand %xmm15,%xmm12,%xmm12
1217
vpaddq %xmm2,%xmm13,%xmm13
1218
1219
vpsrlq $26,%xmm10,%xmm0
1220
vpand %xmm15,%xmm10,%xmm10
1221
vpaddq %xmm0,%xmm11,%xmm11
1222
1223
vpsrlq $26,%xmm13,%xmm3
1224
vpand %xmm15,%xmm13,%xmm13
1225
vpaddq %xmm3,%xmm14,%xmm14
1226
1227
vmovd %xmm10,-112(%rdi)
1228
vmovd %xmm11,-108(%rdi)
1229
vmovd %xmm12,-104(%rdi)
1230
vmovd %xmm13,-100(%rdi)
1231
vmovd %xmm14,-96(%rdi)
1232
leaq 88(%r11),%rsp
1233
.cfi_def_cfa %rsp,8
1234
vzeroupper
1235
.byte 0xf3,0xc3
1236
.cfi_endproc
1237
.size poly1305_blocks_avx,.-poly1305_blocks_avx
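
/*
 * poly1305_emit_avx: same output step as poly1305_emit, but when the flag
 * at 20(%rdi) is set it first converts the accumulator from base 2^26 back
 * to base 2^64; with the flag clear it simply branches to .Lemit above.
 */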
1238
1239
.type poly1305_emit_avx,@function
1240
.align 32
1241
poly1305_emit_avx:
1242
.cfi_startproc
1243
.byte 243,15,30,250
1244
cmpl $0,20(%rdi)
1245
je .Lemit
1246
1247
movl 0(%rdi),%eax
1248
movl 4(%rdi),%ecx
1249
movl 8(%rdi),%r8d
1250
movl 12(%rdi),%r11d
1251
movl 16(%rdi),%r10d
1252
1253
shlq $26,%rcx
1254
movq %r8,%r9
1255
shlq $52,%r8
1256
addq %rcx,%rax
1257
shrq $12,%r9
1258
addq %rax,%r8
1259
adcq $0,%r9
1260
1261
shlq $14,%r11
1262
movq %r10,%rax
1263
shrq $24,%r10
1264
addq %r11,%r9
1265
shlq $40,%rax
1266
addq %rax,%r9
1267
adcq $0,%r10
1268
1269
movq %r10,%rax
1270
movq %r10,%rcx
1271
andq $3,%r10
1272
shrq $2,%rax
1273
andq $-4,%rcx
1274
addq %rcx,%rax
1275
addq %rax,%r8
1276
adcq $0,%r9
1277
adcq $0,%r10
1278
1279
movq %r8,%rax
1280
addq $5,%r8
1281
movq %r9,%rcx
1282
adcq $0,%r9
1283
adcq $0,%r10
1284
shrq $2,%r10
1285
cmovnzq %r8,%rax
1286
cmovnzq %r9,%rcx
1287
1288
addq 0(%rdx),%rax
1289
adcq 8(%rdx),%rcx
1290
movq %rax,0(%rsi)
1291
movq %rcx,8(%rsi)
1292
1293
.byte 0xf3,0xc3
1294
.cfi_endproc
1295
.size poly1305_emit_avx,.-poly1305_emit_avx
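
/*
 * poly1305_blocks_avx2: AVX2 variant of poly1305_blocks_avx, processing
 * 64 bytes (four blocks) per iteration with the base 2^26 limbs held in
 * %ymm registers; it reuses __poly1305_block and __poly1305_init_avx for
 * radix conversion and for blocks that do not fill a whole vector pass.
 */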
1296
.type poly1305_blocks_avx2,@function
1297
.align 32
1298
poly1305_blocks_avx2:
1299
.cfi_startproc
1300
.byte 243,15,30,250
1301
movl 20(%rdi),%r8d
1302
cmpq $128,%rdx
1303
jae .Lblocks_avx2
1304
testl %r8d,%r8d
1305
jz .Lblocks
1306
1307
.Lblocks_avx2:
1308
andq $-16,%rdx
1309
jz .Lno_data_avx2
1310
1311
vzeroupper
1312
1313
testl %r8d,%r8d
1314
jz .Lbase2_64_avx2
1315
1316
testq $63,%rdx
1317
jz .Leven_avx2
1318
1319
pushq %rbx
1320
.cfi_adjust_cfa_offset 8
1321
.cfi_offset %rbx,-16
1322
pushq %rbp
1323
.cfi_adjust_cfa_offset 8
1324
.cfi_offset %rbp,-24
1325
pushq %r12
1326
.cfi_adjust_cfa_offset 8
1327
.cfi_offset %r12,-32
1328
pushq %r13
1329
.cfi_adjust_cfa_offset 8
1330
.cfi_offset %r13,-40
1331
pushq %r14
1332
.cfi_adjust_cfa_offset 8
1333
.cfi_offset %r14,-48
1334
pushq %r15
1335
.cfi_adjust_cfa_offset 8
1336
.cfi_offset %r15,-56
1337
.Lblocks_avx2_body:
1338
1339
movq %rdx,%r15
1340
1341
movq 0(%rdi),%r8
1342
movq 8(%rdi),%r9
1343
movl 16(%rdi),%ebp
1344
1345
movq 24(%rdi),%r11
1346
movq 32(%rdi),%r13
1347
1348
1349
movl %r8d,%r14d
1350
andq $-2147483648,%r8
1351
movq %r9,%r12
1352
movl %r9d,%ebx
1353
andq $-2147483648,%r9
1354
1355
shrq $6,%r8
1356
shlq $52,%r12
1357
addq %r8,%r14
1358
shrq $12,%rbx
1359
shrq $18,%r9
1360
addq %r12,%r14
1361
adcq %r9,%rbx
1362
1363
movq %rbp,%r8
1364
shlq $40,%r8
1365
shrq $24,%rbp
1366
addq %r8,%rbx
1367
adcq $0,%rbp
1368
1369
movq $-4,%r9
1370
movq %rbp,%r8
1371
andq %rbp,%r9
1372
shrq $2,%r8
1373
andq $3,%rbp
1374
addq %r9,%r8
1375
addq %r8,%r14
1376
adcq $0,%rbx
1377
adcq $0,%rbp
1378
1379
movq %r13,%r12
1380
movq %r13,%rax
1381
shrq $2,%r13
1382
addq %r12,%r13
1383
1384
.Lbase2_26_pre_avx2:
1385
addq 0(%rsi),%r14
1386
adcq 8(%rsi),%rbx
1387
leaq 16(%rsi),%rsi
1388
adcq %rcx,%rbp
1389
subq $16,%r15
1390
1391
call __poly1305_block
1392
movq %r12,%rax
1393
1394
testq $63,%r15
1395
jnz .Lbase2_26_pre_avx2
1396
1397
testq %rcx,%rcx
1398
jz .Lstore_base2_64_avx2
1399
1400
1401
movq %r14,%rax
1402
movq %r14,%rdx
1403
shrq $52,%r14
1404
movq %rbx,%r11
1405
movq %rbx,%r12
1406
shrq $26,%rdx
1407
andq $0x3ffffff,%rax
1408
shlq $12,%r11
1409
andq $0x3ffffff,%rdx
1410
shrq $14,%rbx
1411
orq %r11,%r14
1412
shlq $24,%rbp
1413
andq $0x3ffffff,%r14
1414
shrq $40,%r12
1415
andq $0x3ffffff,%rbx
1416
orq %r12,%rbp
1417
1418
testq %r15,%r15
1419
jz .Lstore_base2_26_avx2
1420
1421
vmovd %eax,%xmm0
1422
vmovd %edx,%xmm1
1423
vmovd %r14d,%xmm2
1424
vmovd %ebx,%xmm3
1425
vmovd %ebp,%xmm4
1426
jmp .Lproceed_avx2
1427
1428
.align 32
1429
.Lstore_base2_64_avx2:
1430
movq %r14,0(%rdi)
1431
movq %rbx,8(%rdi)
1432
movq %rbp,16(%rdi)
1433
jmp .Ldone_avx2
1434
1435
.align 16
1436
.Lstore_base2_26_avx2:
1437
movl %eax,0(%rdi)
1438
movl %edx,4(%rdi)
1439
movl %r14d,8(%rdi)
1440
movl %ebx,12(%rdi)
1441
movl %ebp,16(%rdi)
1442
.align 16
1443
.Ldone_avx2:
1444
movq 0(%rsp),%r15
1445
.cfi_restore %r15
1446
movq 8(%rsp),%r14
1447
.cfi_restore %r14
1448
movq 16(%rsp),%r13
1449
.cfi_restore %r13
1450
movq 24(%rsp),%r12
1451
.cfi_restore %r12
1452
movq 32(%rsp),%rbp
1453
.cfi_restore %rbp
1454
movq 40(%rsp),%rbx
1455
.cfi_restore %rbx
1456
leaq 48(%rsp),%rsp
1457
.cfi_adjust_cfa_offset -48
1458
.Lno_data_avx2:
1459
.Lblocks_avx2_epilogue:
1460
.byte 0xf3,0xc3
1461
.cfi_endproc
1462
1463
.align 32
1464
.Lbase2_64_avx2:
1465
.cfi_startproc
1466
pushq %rbx
1467
.cfi_adjust_cfa_offset 8
1468
.cfi_offset %rbx,-16
1469
pushq %rbp
1470
.cfi_adjust_cfa_offset 8
1471
.cfi_offset %rbp,-24
1472
pushq %r12
1473
.cfi_adjust_cfa_offset 8
1474
.cfi_offset %r12,-32
1475
pushq %r13
1476
.cfi_adjust_cfa_offset 8
1477
.cfi_offset %r13,-40
1478
pushq %r14
1479
.cfi_adjust_cfa_offset 8
1480
.cfi_offset %r14,-48
1481
pushq %r15
1482
.cfi_adjust_cfa_offset 8
1483
.cfi_offset %r15,-56
1484
.Lbase2_64_avx2_body:
1485
1486
movq %rdx,%r15
1487
1488
movq 24(%rdi),%r11
1489
movq 32(%rdi),%r13
1490
1491
movq 0(%rdi),%r14
1492
movq 8(%rdi),%rbx
1493
movl 16(%rdi),%ebp
1494
1495
movq %r13,%r12
1496
movq %r13,%rax
1497
shrq $2,%r13
1498
addq %r12,%r13
1499
1500
testq $63,%rdx
1501
jz .Linit_avx2
1502
1503
.Lbase2_64_pre_avx2:
1504
addq 0(%rsi),%r14
1505
adcq 8(%rsi),%rbx
1506
leaq 16(%rsi),%rsi
1507
adcq %rcx,%rbp
1508
subq $16,%r15
1509
1510
call __poly1305_block
1511
movq %r12,%rax
1512
1513
testq $63,%r15
1514
jnz .Lbase2_64_pre_avx2
1515
1516
.Linit_avx2:
1517
1518
movq %r14,%rax
1519
movq %r14,%rdx
1520
shrq $52,%r14
1521
movq %rbx,%r8
1522
movq %rbx,%r9
1523
shrq $26,%rdx
1524
andq $0x3ffffff,%rax
1525
shlq $12,%r8
1526
andq $0x3ffffff,%rdx
1527
shrq $14,%rbx
1528
orq %r8,%r14
1529
shlq $24,%rbp
1530
andq $0x3ffffff,%r14
1531
shrq $40,%r9
1532
andq $0x3ffffff,%rbx
1533
orq %r9,%rbp
1534
1535
vmovd %eax,%xmm0
1536
vmovd %edx,%xmm1
1537
vmovd %r14d,%xmm2
1538
vmovd %ebx,%xmm3
1539
vmovd %ebp,%xmm4
1540
movl $1,20(%rdi)
1541
1542
call __poly1305_init_avx
1543
1544
.Lproceed_avx2:
1545
movq %r15,%rdx
1546
movl OPENSSL_ia32cap_P+8(%rip),%r10d
1547
movl $3221291008,%r11d
1548
1549
movq 0(%rsp),%r15
1550
.cfi_restore %r15
1551
movq 8(%rsp),%r14
1552
.cfi_restore %r14
1553
movq 16(%rsp),%r13
1554
.cfi_restore %r13
1555
movq 24(%rsp),%r12
1556
.cfi_restore %r12
1557
movq 32(%rsp),%rbp
1558
.cfi_restore %rbp
1559
movq 40(%rsp),%rbx
1560
.cfi_restore %rbx
1561
leaq 48(%rsp),%rax
1562
leaq 48(%rsp),%rsp
1563
.cfi_adjust_cfa_offset -48
1564
.Lbase2_64_avx2_epilogue:
1565
jmp .Ldo_avx2
1566
.cfi_endproc
1567
1568
.align 32
1569
.Leven_avx2:
1570
.cfi_startproc
1571
movl OPENSSL_ia32cap_P+8(%rip),%r10d
1572
vmovd 0(%rdi),%xmm0
1573
vmovd 4(%rdi),%xmm1
1574
vmovd 8(%rdi),%xmm2
1575
vmovd 12(%rdi),%xmm3
1576
vmovd 16(%rdi),%xmm4
1577
1578
.Ldo_avx2:
1579
leaq -8(%rsp),%r11
1580
.cfi_def_cfa %r11,16
1581
subq $0x128,%rsp
1582
leaq .Lconst(%rip),%rcx
1583
leaq 48+64(%rdi),%rdi
1584
vmovdqa 96(%rcx),%ymm7
1585
1586
1587
vmovdqu -64(%rdi),%xmm9
1588
andq $-512,%rsp
1589
vmovdqu -48(%rdi),%xmm10
1590
vmovdqu -32(%rdi),%xmm6
1591
vmovdqu -16(%rdi),%xmm11
1592
vmovdqu 0(%rdi),%xmm12
1593
vmovdqu 16(%rdi),%xmm13
1594
leaq 144(%rsp),%rax
1595
vmovdqu 32(%rdi),%xmm14
1596
vpermd %ymm9,%ymm7,%ymm9
1597
vmovdqu 48(%rdi),%xmm15
1598
vpermd %ymm10,%ymm7,%ymm10
1599
vmovdqu 64(%rdi),%xmm5
1600
vpermd %ymm6,%ymm7,%ymm6
1601
vmovdqa %ymm9,0(%rsp)
1602
vpermd %ymm11,%ymm7,%ymm11
1603
vmovdqa %ymm10,32-144(%rax)
1604
vpermd %ymm12,%ymm7,%ymm12
1605
vmovdqa %ymm6,64-144(%rax)
1606
vpermd %ymm13,%ymm7,%ymm13
1607
vmovdqa %ymm11,96-144(%rax)
1608
vpermd %ymm14,%ymm7,%ymm14
1609
vmovdqa %ymm12,128-144(%rax)
1610
vpermd %ymm15,%ymm7,%ymm15
1611
vmovdqa %ymm13,160-144(%rax)
1612
vpermd %ymm5,%ymm7,%ymm5
1613
vmovdqa %ymm14,192-144(%rax)
1614
vmovdqa %ymm15,224-144(%rax)
1615
vmovdqa %ymm5,256-144(%rax)
1616
vmovdqa 64(%rcx),%ymm5
1617
1618
1619
1620
vmovdqu 0(%rsi),%xmm7
1621
vmovdqu 16(%rsi),%xmm8
1622
vinserti128 $1,32(%rsi),%ymm7,%ymm7
1623
vinserti128 $1,48(%rsi),%ymm8,%ymm8
1624
leaq 64(%rsi),%rsi
1625
1626
vpsrldq $6,%ymm7,%ymm9
1627
vpsrldq $6,%ymm8,%ymm10
1628
vpunpckhqdq %ymm8,%ymm7,%ymm6
1629
vpunpcklqdq %ymm10,%ymm9,%ymm9
1630
vpunpcklqdq %ymm8,%ymm7,%ymm7
1631
1632
vpsrlq $30,%ymm9,%ymm10
1633
vpsrlq $4,%ymm9,%ymm9
1634
vpsrlq $26,%ymm7,%ymm8
1635
vpsrlq $40,%ymm6,%ymm6
1636
vpand %ymm5,%ymm9,%ymm9
1637
vpand %ymm5,%ymm7,%ymm7
1638
vpand %ymm5,%ymm8,%ymm8
1639
vpand %ymm5,%ymm10,%ymm10
1640
vpor 32(%rcx),%ymm6,%ymm6
1641
1642
vpaddq %ymm2,%ymm9,%ymm2
1643
subq $64,%rdx
1644
jz .Ltail_avx2
1645
jmp .Loop_avx2
1646
1647
.align 32
1648
.Loop_avx2:
1649
1650
1651
1652
1653
1654
1655
1656
1657
vpaddq %ymm0,%ymm7,%ymm0
1658
vmovdqa 0(%rsp),%ymm7
1659
vpaddq %ymm1,%ymm8,%ymm1
1660
vmovdqa 32(%rsp),%ymm8
1661
vpaddq %ymm3,%ymm10,%ymm3
1662
vmovdqa 96(%rsp),%ymm9
1663
vpaddq %ymm4,%ymm6,%ymm4
1664
vmovdqa 48(%rax),%ymm10
1665
vmovdqa 112(%rax),%ymm5
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
vpmuludq %ymm2,%ymm7,%ymm13
1683
vpmuludq %ymm2,%ymm8,%ymm14
1684
vpmuludq %ymm2,%ymm9,%ymm15
1685
vpmuludq %ymm2,%ymm10,%ymm11
1686
vpmuludq %ymm2,%ymm5,%ymm12
1687
1688
vpmuludq %ymm0,%ymm8,%ymm6
1689
vpmuludq %ymm1,%ymm8,%ymm2
1690
vpaddq %ymm6,%ymm12,%ymm12
1691
vpaddq %ymm2,%ymm13,%ymm13
1692
vpmuludq %ymm3,%ymm8,%ymm6
1693
vpmuludq 64(%rsp),%ymm4,%ymm2
1694
vpaddq %ymm6,%ymm15,%ymm15
1695
vpaddq %ymm2,%ymm11,%ymm11
1696
vmovdqa -16(%rax),%ymm8
1697
1698
vpmuludq %ymm0,%ymm7,%ymm6
1699
vpmuludq %ymm1,%ymm7,%ymm2
1700
vpaddq %ymm6,%ymm11,%ymm11
1701
vpaddq %ymm2,%ymm12,%ymm12
1702
vpmuludq %ymm3,%ymm7,%ymm6
1703
vpmuludq %ymm4,%ymm7,%ymm2
1704
vmovdqu 0(%rsi),%xmm7
1705
vpaddq %ymm6,%ymm14,%ymm14
1706
vpaddq %ymm2,%ymm15,%ymm15
1707
vinserti128 $1,32(%rsi),%ymm7,%ymm7
1708
1709
vpmuludq %ymm3,%ymm8,%ymm6
1710
vpmuludq %ymm4,%ymm8,%ymm2
1711
vmovdqu 16(%rsi),%xmm8
1712
vpaddq %ymm6,%ymm11,%ymm11
1713
vpaddq %ymm2,%ymm12,%ymm12
1714
vmovdqa 16(%rax),%ymm2
1715
vpmuludq %ymm1,%ymm9,%ymm6
1716
vpmuludq %ymm0,%ymm9,%ymm9
1717
vpaddq %ymm6,%ymm14,%ymm14
1718
vpaddq %ymm9,%ymm13,%ymm13
1719
vinserti128 $1,48(%rsi),%ymm8,%ymm8
1720
leaq 64(%rsi),%rsi
1721
1722
vpmuludq %ymm1,%ymm2,%ymm6
1723
vpmuludq %ymm0,%ymm2,%ymm2
1724
vpsrldq $6,%ymm7,%ymm9
1725
vpaddq %ymm6,%ymm15,%ymm15
1726
vpaddq %ymm2,%ymm14,%ymm14
1727
vpmuludq %ymm3,%ymm10,%ymm6
1728
vpmuludq %ymm4,%ymm10,%ymm2
1729
vpsrldq $6,%ymm8,%ymm10
1730
vpaddq %ymm6,%ymm12,%ymm12
1731
vpaddq %ymm2,%ymm13,%ymm13
1732
vpunpckhqdq %ymm8,%ymm7,%ymm6
1733
1734
vpmuludq %ymm3,%ymm5,%ymm3
1735
vpmuludq %ymm4,%ymm5,%ymm4
1736
vpunpcklqdq %ymm8,%ymm7,%ymm7
1737
vpaddq %ymm3,%ymm13,%ymm2
1738
vpaddq %ymm4,%ymm14,%ymm3
1739
vpunpcklqdq %ymm10,%ymm9,%ymm10
1740
vpmuludq 80(%rax),%ymm0,%ymm4
1741
vpmuludq %ymm1,%ymm5,%ymm0
1742
vmovdqa 64(%rcx),%ymm5
1743
vpaddq %ymm4,%ymm15,%ymm4
1744
vpaddq %ymm0,%ymm11,%ymm0
1745
1746
1747
1748
1749
vpsrlq $26,%ymm3,%ymm14
1750
vpand %ymm5,%ymm3,%ymm3
1751
vpaddq %ymm14,%ymm4,%ymm4
1752
1753
vpsrlq $26,%ymm0,%ymm11
1754
vpand %ymm5,%ymm0,%ymm0
1755
vpaddq %ymm11,%ymm12,%ymm1
1756
1757
vpsrlq $26,%ymm4,%ymm15
1758
vpand %ymm5,%ymm4,%ymm4
1759
1760
vpsrlq $4,%ymm10,%ymm9
1761
1762
vpsrlq $26,%ymm1,%ymm12
1763
vpand %ymm5,%ymm1,%ymm1
1764
vpaddq %ymm12,%ymm2,%ymm2
1765
1766
vpaddq %ymm15,%ymm0,%ymm0
1767
vpsllq $2,%ymm15,%ymm15
1768
vpaddq %ymm15,%ymm0,%ymm0
1769
1770
vpand %ymm5,%ymm9,%ymm9
1771
vpsrlq $26,%ymm7,%ymm8
1772
1773
vpsrlq $26,%ymm2,%ymm13
1774
vpand %ymm5,%ymm2,%ymm2
1775
vpaddq %ymm13,%ymm3,%ymm3
1776
1777
vpaddq %ymm9,%ymm2,%ymm2
1778
vpsrlq $30,%ymm10,%ymm10
1779
1780
vpsrlq $26,%ymm0,%ymm11
1781
vpand %ymm5,%ymm0,%ymm0
1782
vpaddq %ymm11,%ymm1,%ymm1
1783
1784
vpsrlq $40,%ymm6,%ymm6
1785
1786
vpsrlq $26,%ymm3,%ymm14
1787
vpand %ymm5,%ymm3,%ymm3
1788
vpaddq %ymm14,%ymm4,%ymm4
1789
1790
vpand %ymm5,%ymm7,%ymm7
1791
vpand %ymm5,%ymm8,%ymm8
1792
vpand %ymm5,%ymm10,%ymm10
1793
vpor 32(%rcx),%ymm6,%ymm6
1794
1795
subq $64,%rdx
1796
jnz .Loop_avx2
1797
1798
.byte 0x66,0x90
1799
.Ltail_avx2:
1800
1801
1802
1803
1804
1805
1806
1807
vpaddq %ymm0,%ymm7,%ymm0
1808
vmovdqu 4(%rsp),%ymm7
1809
vpaddq %ymm1,%ymm8,%ymm1
1810
vmovdqu 36(%rsp),%ymm8
1811
vpaddq %ymm3,%ymm10,%ymm3
1812
vmovdqu 100(%rsp),%ymm9
1813
vpaddq %ymm4,%ymm6,%ymm4
1814
vmovdqu 52(%rax),%ymm10
1815
vmovdqu 116(%rax),%ymm5
1816
1817
vpmuludq %ymm2,%ymm7,%ymm13
1818
vpmuludq %ymm2,%ymm8,%ymm14
1819
vpmuludq %ymm2,%ymm9,%ymm15
1820
vpmuludq %ymm2,%ymm10,%ymm11
1821
vpmuludq %ymm2,%ymm5,%ymm12
1822
1823
vpmuludq %ymm0,%ymm8,%ymm6
1824
vpmuludq %ymm1,%ymm8,%ymm2
1825
vpaddq %ymm6,%ymm12,%ymm12
1826
vpaddq %ymm2,%ymm13,%ymm13
1827
vpmuludq %ymm3,%ymm8,%ymm6
1828
vpmuludq 68(%rsp),%ymm4,%ymm2
1829
vpaddq %ymm6,%ymm15,%ymm15
1830
vpaddq %ymm2,%ymm11,%ymm11
1831
1832
vpmuludq %ymm0,%ymm7,%ymm6
1833
vpmuludq %ymm1,%ymm7,%ymm2
1834
vpaddq %ymm6,%ymm11,%ymm11
1835
vmovdqu -12(%rax),%ymm8
1836
vpaddq %ymm2,%ymm12,%ymm12
1837
vpmuludq %ymm3,%ymm7,%ymm6
1838
vpmuludq %ymm4,%ymm7,%ymm2
1839
vpaddq %ymm6,%ymm14,%ymm14
1840
vpaddq %ymm2,%ymm15,%ymm15
1841
1842
vpmuludq %ymm3,%ymm8,%ymm6
1843
vpmuludq %ymm4,%ymm8,%ymm2
1844
vpaddq %ymm6,%ymm11,%ymm11
1845
vpaddq %ymm2,%ymm12,%ymm12
1846
vmovdqu 20(%rax),%ymm2
1847
vpmuludq %ymm1,%ymm9,%ymm6
1848
vpmuludq %ymm0,%ymm9,%ymm9
1849
vpaddq %ymm6,%ymm14,%ymm14
1850
vpaddq %ymm9,%ymm13,%ymm13
1851
1852
vpmuludq %ymm1,%ymm2,%ymm6
1853
vpmuludq %ymm0,%ymm2,%ymm2
1854
vpaddq %ymm6,%ymm15,%ymm15
1855
vpaddq %ymm2,%ymm14,%ymm14
1856
vpmuludq %ymm3,%ymm10,%ymm6
1857
vpmuludq %ymm4,%ymm10,%ymm2
1858
vpaddq %ymm6,%ymm12,%ymm12
1859
vpaddq %ymm2,%ymm13,%ymm13
1860
1861
vpmuludq %ymm3,%ymm5,%ymm3
1862
vpmuludq %ymm4,%ymm5,%ymm4
1863
vpaddq %ymm3,%ymm13,%ymm2
1864
vpaddq %ymm4,%ymm14,%ymm3
1865
vpmuludq 84(%rax),%ymm0,%ymm4
1866
vpmuludq %ymm1,%ymm5,%ymm0
1867
vmovdqa 64(%rcx),%ymm5
1868
vpaddq %ymm4,%ymm15,%ymm4
1869
vpaddq %ymm0,%ymm11,%ymm0
1870
1871
1872
1873
1874
vpsrldq $8,%ymm12,%ymm8
1875
vpsrldq $8,%ymm2,%ymm9
1876
vpsrldq $8,%ymm3,%ymm10
1877
vpsrldq $8,%ymm4,%ymm6
1878
vpsrldq $8,%ymm0,%ymm7
1879
vpaddq %ymm8,%ymm12,%ymm12
1880
vpaddq %ymm9,%ymm2,%ymm2
1881
vpaddq %ymm10,%ymm3,%ymm3
1882
vpaddq %ymm6,%ymm4,%ymm4
1883
vpaddq %ymm7,%ymm0,%ymm0
1884
1885
vpermq $0x2,%ymm3,%ymm10
1886
vpermq $0x2,%ymm4,%ymm6
1887
vpermq $0x2,%ymm0,%ymm7
1888
vpermq $0x2,%ymm12,%ymm8
1889
vpermq $0x2,%ymm2,%ymm9
1890
vpaddq %ymm10,%ymm3,%ymm3
1891
vpaddq %ymm6,%ymm4,%ymm4
1892
vpaddq %ymm7,%ymm0,%ymm0
1893
vpaddq %ymm8,%ymm12,%ymm12
1894
vpaddq %ymm9,%ymm2,%ymm2
1895
1896
1897
1898
1899
vpsrlq $26,%ymm3,%ymm14
1900
vpand %ymm5,%ymm3,%ymm3
1901
vpaddq %ymm14,%ymm4,%ymm4
1902
1903
vpsrlq $26,%ymm0,%ymm11
1904
vpand %ymm5,%ymm0,%ymm0
1905
vpaddq %ymm11,%ymm12,%ymm1
1906
1907
vpsrlq $26,%ymm4,%ymm15
1908
vpand %ymm5,%ymm4,%ymm4
1909
1910
vpsrlq $26,%ymm1,%ymm12
1911
vpand %ymm5,%ymm1,%ymm1
1912
vpaddq %ymm12,%ymm2,%ymm2
1913
1914
vpaddq %ymm15,%ymm0,%ymm0
1915
vpsllq $2,%ymm15,%ymm15
1916
vpaddq %ymm15,%ymm0,%ymm0
1917
1918
vpsrlq $26,%ymm2,%ymm13
1919
vpand %ymm5,%ymm2,%ymm2
1920
vpaddq %ymm13,%ymm3,%ymm3
1921
1922
vpsrlq $26,%ymm0,%ymm11
1923
vpand %ymm5,%ymm0,%ymm0
1924
vpaddq %ymm11,%ymm1,%ymm1
1925
1926
vpsrlq $26,%ymm3,%ymm14
1927
vpand %ymm5,%ymm3,%ymm3
1928
vpaddq %ymm14,%ymm4,%ymm4
1929
1930
vmovd %xmm0,-112(%rdi)
1931
vmovd %xmm1,-108(%rdi)
1932
vmovd %xmm2,-104(%rdi)
1933
vmovd %xmm3,-100(%rdi)
1934
vmovd %xmm4,-96(%rdi)
1935
leaq 8(%r11),%rsp
1936
.cfi_def_cfa %rsp,8
1937
vzeroupper
1938
.byte 0xf3,0xc3
1939
.cfi_endproc
1940
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
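
/*
 * Constant pool for the vector code: 24-/26-bit limb masks, the 1<<24
 * padbit constant (.L129) and the vpermd patterns.  The base 2^44/2^42
 * masks and shift tables below them appear to belong to the AVX-512 path
 * of the upstream CRYPTOGAMS script and are not referenced by the routines
 * in this file.
 */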
.section	.rodata
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.previous
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
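
/*
 * xor128_encrypt_n_pad / xor128_decrypt_n_pad: helpers apparently used by
 * the ChaCha20-Poly1305 glue.  Arguments (SysV): %rdi = output, %rsi =
 * input, %rdx = 16-byte-aligned key-stream/scratch buffer, %rcx = length.
 * The input is XORed with the buffer 16 bytes at a time, the result is
 * written to the output, the ciphertext is stored back into the buffer,
 * and the tail of the last 16-byte chunk is zero-padded so the buffer can
 * be fed straight to Poly1305.  The advanced buffer pointer is returned in
 * %rax.
 */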
.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
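
/*
 * xor128_decrypt_n_pad: decryption counterpart; the output receives
 * input XOR buffer, while the original (cipher-text) input, zero-padded,
 * is left behind in the buffer at %rdx for authentication.
 */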
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	andq	$15,%r10
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
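
/*
 * GNU property note marking the object as Intel CET compatible
 * (0xc0000002 = GNU_PROPERTY_X86_FEATURE_1_AND, value 3 = IBT | SHSTK),
 * matching the endbr64 prologues above.
 */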
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4: