Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
26292 views
1
#!/usr/bin/env perl
2
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
3
#
4
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
6
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
7
#
8
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
9
# has relicensed it under the licenses specified in the SPDX header above.
10
# The original headers, including the original license headers, are
11
# included below for completeness.
12
#
13
# ====================================================================
14
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
15
# project. The module is, however, dual licensed under OpenSSL and
16
# CRYPTOGAMS licenses depending on where you obtain it. For further
17
# details see http://www.openssl.org/~appro/cryptogams/.
18
# ====================================================================
19
#
20
# This module implements Poly1305 hash for x86_64.
21
#
22
# March 2015
23
#
24
# Initial release.
25
#
26
# December 2016
27
#
28
# Add AVX512F+VL+BW code path.
29
#
30
# November 2017
31
#
32
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
33
# executed even on Knights Landing. Trigger for modification was
34
# observation that AVX512 code paths can negatively affect overall
35
# Skylake-X system performance. Since we are likely to suppress
36
# AVX512F capability flag [at least on Skylake-X], conversion serves
37
# as kind of "investment protection". Note that next *lake processor,
38
# Cannonlake, has AVX512IFMA code path to execute...
39
#
40
# Numbers are cycles per processed byte with poly1305_blocks alone,
41
# measured with rdtsc at fixed clock frequency.
42
#
43
#                 IALU/gcc-4.8(*)   AVX(**)   AVX2    AVX-512
# P4              4.46/+120%        -
# Core 2          2.41/+90%         -
# Westmere        1.88/+120%        -
# Sandy Bridge    1.39/+140%        1.10
# Haswell         1.14/+175%        1.11      0.65
# Skylake[-X]     1.13/+120%        0.96      0.51    [0.35]
# Silvermont      2.83/+95%         -
# Knights L       3.60/?            1.65      1.10    0.41(***)
# Goldmont        1.70/+180%        -
# VIA Nano        1.82/+150%        -
# Sledgehammer    1.38/+160%        -
# Bulldozer       2.30/+130%        0.97
# Ryzen           1.15/+200%        1.08      1.18
57
#
58
# (*) improvement coefficients relative to clang are more modest and
59
# are ~50% on most processors, in both cases we are comparing to
60
# __int128 code;
61
# (**) SSE2 implementation was attempted, but among non-AVX processors
62
# it was faster than integer-only code only on older Intel P4 and
63
# Core processors, 50-30%, less newer processor is, but slower on
64
# contemporary ones, for example almost 2x slower on Atom, and as
65
# former are naturally disappearing, SSE2 is deemed unnecessary;
66
# (***) strangely enough performance seems to vary from core to core,
67
# listed result is best case;
68
69
# Command line: [flavour] [output].  A first argument that contains a dot
# is taken to be the output file name, leaving the flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows assemblers are selected by flavour or by an ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# With neither a flavour nor an output file the kernel variant is generated.
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Locate the perlasm translator next to this script or in ../../perlasm.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# From here on STDOUT is piped through the translator.  Fail loudly if
	# the pipe cannot be created rather than silently emitting nothing.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# $avx is a capability level derived from the assembler version; later
	# code gates the AVX path on $avx, the AVX2 path on $avx>1 and the
	# base 2^44 path on $avx>3.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
107
108
# Append the prologue of a global function to $code.
#
#   $name  - symbol name of the function
#   $align - alignment for the entry point (userspace flavours only)
#   $nargs - argument count recorded in the .type directive (userspace only)
#
# The kernel variant uses the SYM_FUNC_START() macro and also defines a
# local ".L$name" alias; other flavours emit .globl/.type/.align directives.
#
# No prototype is declared: the sub takes three arguments, and an empty
# "()" prototype would reject every call not written with a leading "&".
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}
120
121
# Append the prologue of a global function to $code, like
# declare_function(), except that the kernel variant opens the function
# with SYM_TYPED_FUNC_START() instead of SYM_FUNC_START().
#
#   $name  - symbol name of the function
#   $align - alignment for the entry point (userspace flavours only)
#   $nargs - argument count recorded in the .type directive (userspace only)
#
# No prototype is declared: the sub takes three arguments, and an empty
# "()" prototype would reject every call not written with a leading "&".
sub declare_typed_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_TYPED_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}
133
134
# Append the epilogue marker of function $name to $code: SYM_FUNC_END()
# for the kernel variant, a .size directive otherwise.
#
# No prototype is declared: the sub takes one argument, and an empty "()"
# prototype would reject every call not written with a leading "&".
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}
142
143
# The kernel variant needs the CFI type macros used by
# SYM_TYPED_FUNC_START().
$code.=<<___ if $kernel;
#include <linux/cfi_types.h>
___

# Constant tables, needed only when vector code paths are generated.
# The AVX code addresses .Lmask24, .L129 and .Lmask26 as offsets 0, 32 and
# 64 from .Lconst, so the order and size of the first three tables must
# not change.  Backtick expressions such as `1<<24` are left in the text
# here.
# NOTE(review): presumably they are evaluated when $code is post-processed
# at the end of the file; that step is not visible in this chunk.
if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64

.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
# Identification string, emitted for non-kernel flavours only.
$code.=<<___ if (!$kernel);
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
189
190
# Register assignment shared by the scalar code.  Note that $d3 is the
# same register as $ctx (%rdi): poly1305_iteration() clobbers it, which is
# why callers save $ctx on the stack before multiplying.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# Append one multiply-and-reduce step to $code.
#
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
#		($s1 must hold r1 + (r1 >> 2), as set up by the callers)
# output:	$h0-$h2 *= $r0-$r1
#
# Clobbers %rax, %rdx and $d1-$d3.  The last group of instructions folds
# the bits of $d3 above bit 1 back into $h0 ("last reduction step") and
# keeps the low two bits as the new $h2.
sub poly1305_iteration {
$code.=<<___;
	mulq $h0 # h0*r1
	mov %rax,$d2
	mov $r0,%rax
	mov %rdx,$d3

	mulq $h0 # h0*r0
	mov %rax,$h0 # future $h0
	mov $r0,%rax
	mov %rdx,$d1

	mulq $h1 # h1*r0
	add %rax,$d2
	mov $s1,%rax
	adc %rdx,$d3

	mulq $h1 # h1*s1
	mov $h2,$h1 # borrow $h1
	add %rax,$h0
	adc %rdx,$d1

	imulq $s1,$h1 # h2*s1
	add $h1,$d2
	mov $d1,$h1
	adc \$0,$d3

	imulq $r0,$h2 # h2*r0
	add $d2,$h1
	mov \$-4,%rax # mask value
	adc $h2,$d3

	and $d3,%rax # last reduction step
	mov $d3,$h2
	shr \$2,$d3
	and \$3,$h2
	add $d3,%rax
	add %rax,$h0
	adc \$0,$h1
	adc \$0,$h2
___
}
239
240
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
# Non-kernel flavours reference the OpenSSL capability vector and keep the
# scalar entry points hidden.
$code.=<<___ if (!$kernel);
.extern OPENSSL_ia32cap_P

.globl poly1305_block_init_arch
.hidden poly1305_block_init_arch
.globl poly1305_blocks_x86_64
.hidden poly1305_blocks_x86_64
.globl poly1305_emit_x86_64
.hidden poly1305_emit_x86_64
___
# poly1305_block_init_arch($ctx, $inp[, table in %rdx])
#
# Zeroes h[0..2].  When the key pointer $inp is NULL it returns 0 in %eax
# right away.  Otherwise it masks the two 64-bit key words, stores them as
# r at 24($ctx) and 32($ctx), and returns 1.
&declare_typed_function("poly1305_block_init_arch", 32, 3);
$code.=<<___;
	xor %eax,%eax
	mov %rax,0($ctx) # initialize hash value
	mov %rax,8($ctx)
	mov %rax,16($ctx)

	test $inp,$inp
	je .Lno_key
___
# Non-kernel flavours select the blocks/emit implementation at run time:
# start from the scalar routines, then upgrade according to the capability
# bits loaded from OPENSSL_ia32cap_P+4.
$code.=<<___ if (!$kernel);
	lea poly1305_blocks_x86_64(%rip),%r10
	lea poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov OPENSSL_ia32cap_P+4(%rip),%r9
	lea poly1305_blocks_avx(%rip),%rax
	lea poly1305_emit_avx(%rip),%rcx
	bt \$`60-32`,%r9 # AVX?
	cmovc %rax,%r10
	cmovc %rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea poly1305_blocks_avx2(%rip),%rax
	bt \$`5+32`,%r9 # AVX2?
	cmovc %rax,%r10
___
# If all three capability bits tested below are set, control transfers to
# .Linit_base2_44, which is defined elsewhere in the file.
$code.=<<___ if (!$kernel && $avx>3);
	mov \$`(1<<31|1<<21|1<<16)`,%rax
	shr \$32,%r9
	and %rax,%r9
	cmp %rax,%r9
	je .Linit_base2_44
___
$code.=<<___;
	mov \$0x0ffffffc0fffffff,%rax
	mov \$0x0ffffffc0ffffffc,%rcx
	and 0($inp),%rax
	and 8($inp),%rcx
	mov %rax,24($ctx)
	mov %rcx,32($ctx)
___
# Publish the selected routines through the table at %rdx: 64-bit entries
# normally, 32-bit entries for elf32 flavours.
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov %r10,0(%rdx)
	mov %r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov %r10d,0(%rdx)
	mov %r11d,4(%rdx)
___
$code.=<<___;
	mov \$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_block_init_arch");
315
316
# poly1305_blocks_x86_64($ctx, $inp, $len, $padbit)
#
# Scalar block function.  $len is divided by 16 to give the block count;
# fewer than 16 bytes returns immediately.  For every block the 16 input
# bytes and $padbit are added to h, then h is multiplied by r through
# poly1305_iteration().  The updated h is stored back to 0..16($ctx).
#
# $ctx is pushed last because poly1305_iteration() clobbers %rdi (it is
# also $d3); it is reloaded from 0(%rsp) once the loop finishes.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr \$4,$len
	jz .Lno_data # too short

	push %rbx
.cfi_push %rbx
	push %r12
.cfi_push %r12
	push %r13
.cfi_push %r13
	push %r14
.cfi_push %r14
	push %r15
.cfi_push %r15
	push $ctx
.cfi_push $ctx
.Lblocks_body:

	mov $len,%r15 # reassign $len

	mov 24($ctx),$r0 # load r
	mov 32($ctx),$s1

	mov 0($ctx),$h0 # load hash value
	mov 8($ctx),$h1
	mov 16($ctx),$h2

	mov $s1,$r1
	shr \$2,$s1
	mov $r1,%rax
	add $r1,$s1 # s1 = r1 + (r1 >> 2)
	jmp .Loop

.align 32
.Loop:
	add 0($inp),$h0 # accumulate input
	adc 8($inp),$h1
	lea 16($inp),$inp
	adc $padbit,$h2
___

	&poly1305_iteration();

# poly1305_iteration() expects a copy of $r1 in %rax, so it is reloaded
# before each further pass through the loop.
$code.=<<___;
	mov $r1,%rax
	dec %r15 # len-=16
	jnz .Loop

	mov 0(%rsp),$ctx
.cfi_restore $ctx

	mov $h0,0($ctx) # store hash value
	mov $h1,8($ctx)
	mov $h2,16($ctx)

	mov 8(%rsp),%r15
.cfi_restore %r15
	mov 16(%rsp),%r14
.cfi_restore %r14
	mov 24(%rsp),%r13
.cfi_restore %r13
	mov 32(%rsp),%r12
.cfi_restore %r12
	mov 40(%rsp),%rbx
.cfi_restore %rbx
	lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");
392
393
# poly1305_emit_x86_64($ctx, $mac, $nonce)
#
# Loads h from $ctx and computes h+5 alongside it.  If h+5 carries into
# bit 130 or above (h was not below the modulus), the low 128 bits of h+5
# are used instead of those of h.  The 128-bit nonce is then added and the
# 16-byte result is written to $mac.
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov 0($ctx),%r8 # load hash value
	mov 8($ctx),%r9
	mov 16($ctx),%r10

	mov %r8,%rax
	add \$5,%r8 # compare to modulus
	mov %r9,%rcx
	adc \$0,%r9
	adc \$0,%r10
	shr \$2,%r10 # did 130-bit value overflow?
	cmovnz %r8,%rax
	cmovnz %r9,%rcx

	add 0($nonce),%rax # accumulate nonce
	adc 8($nonce),%rcx
	mov %rax,0($mac) # write result
	mov %rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
417
if ($avx) {
418
419
########################################################################
420
# Layout of opaque area is following.
421
#
422
# unsigned __int32 h[5]; # current hash value base 2^26
423
# unsigned __int32 is_base2_26;
424
# unsigned __int64 r[2]; # key value base 2^64
425
# unsigned __int64 pad;
426
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
427
#
428
# where r^n are base 2^26 digits of degrees of multiplier key. There are
429
# 5 digits, but last four are interleaved with multiples of 5, totalling
430
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
431
432
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
433
map("%xmm$_",(0..15));
434
435
$code.=<<___;
436
.type __poly1305_block,\@abi-omnipotent
437
.align 32
438
__poly1305_block:
439
push $ctx
440
___
441
&poly1305_iteration();
442
$code.=<<___;
443
pop $ctx
444
RET
445
.size __poly1305_block,.-__poly1305_block
446
447
.type __poly1305_init_avx,\@abi-omnipotent
448
.align 32
449
__poly1305_init_avx:
450
push %rbp
451
mov %rsp,%rbp
452
mov $r0,$h0
453
mov $r1,$h1
454
xor $h2,$h2
455
456
lea 48+64($ctx),$ctx # size optimization
457
458
mov $r1,%rax
459
call __poly1305_block # r^2
460
461
mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
462
mov \$0x3ffffff,%edx
463
mov $h0,$d1
464
and $h0#d,%eax
465
mov $r0,$d2
466
and $r0#d,%edx
467
mov %eax,`16*0+0-64`($ctx)
468
shr \$26,$d1
469
mov %edx,`16*0+4-64`($ctx)
470
shr \$26,$d2
471
472
mov \$0x3ffffff,%eax
473
mov \$0x3ffffff,%edx
474
and $d1#d,%eax
475
and $d2#d,%edx
476
mov %eax,`16*1+0-64`($ctx)
477
lea (%rax,%rax,4),%eax # *5
478
mov %edx,`16*1+4-64`($ctx)
479
lea (%rdx,%rdx,4),%edx # *5
480
mov %eax,`16*2+0-64`($ctx)
481
shr \$26,$d1
482
mov %edx,`16*2+4-64`($ctx)
483
shr \$26,$d2
484
485
mov $h1,%rax
486
mov $r1,%rdx
487
shl \$12,%rax
488
shl \$12,%rdx
489
or $d1,%rax
490
or $d2,%rdx
491
and \$0x3ffffff,%eax
492
and \$0x3ffffff,%edx
493
mov %eax,`16*3+0-64`($ctx)
494
lea (%rax,%rax,4),%eax # *5
495
mov %edx,`16*3+4-64`($ctx)
496
lea (%rdx,%rdx,4),%edx # *5
497
mov %eax,`16*4+0-64`($ctx)
498
mov $h1,$d1
499
mov %edx,`16*4+4-64`($ctx)
500
mov $r1,$d2
501
502
mov \$0x3ffffff,%eax
503
mov \$0x3ffffff,%edx
504
shr \$14,$d1
505
shr \$14,$d2
506
and $d1#d,%eax
507
and $d2#d,%edx
508
mov %eax,`16*5+0-64`($ctx)
509
lea (%rax,%rax,4),%eax # *5
510
mov %edx,`16*5+4-64`($ctx)
511
lea (%rdx,%rdx,4),%edx # *5
512
mov %eax,`16*6+0-64`($ctx)
513
shr \$26,$d1
514
mov %edx,`16*6+4-64`($ctx)
515
shr \$26,$d2
516
517
mov $h2,%rax
518
shl \$24,%rax
519
or %rax,$d1
520
mov $d1#d,`16*7+0-64`($ctx)
521
lea ($d1,$d1,4),$d1 # *5
522
mov $d2#d,`16*7+4-64`($ctx)
523
lea ($d2,$d2,4),$d2 # *5
524
mov $d1#d,`16*8+0-64`($ctx)
525
mov $d2#d,`16*8+4-64`($ctx)
526
527
mov $r1,%rax
528
call __poly1305_block # r^3
529
530
mov \$0x3ffffff,%eax # save r^3 base 2^26
531
mov $h0,$d1
532
and $h0#d,%eax
533
shr \$26,$d1
534
mov %eax,`16*0+12-64`($ctx)
535
536
mov \$0x3ffffff,%edx
537
and $d1#d,%edx
538
mov %edx,`16*1+12-64`($ctx)
539
lea (%rdx,%rdx,4),%edx # *5
540
shr \$26,$d1
541
mov %edx,`16*2+12-64`($ctx)
542
543
mov $h1,%rax
544
shl \$12,%rax
545
or $d1,%rax
546
and \$0x3ffffff,%eax
547
mov %eax,`16*3+12-64`($ctx)
548
lea (%rax,%rax,4),%eax # *5
549
mov $h1,$d1
550
mov %eax,`16*4+12-64`($ctx)
551
552
mov \$0x3ffffff,%edx
553
shr \$14,$d1
554
and $d1#d,%edx
555
mov %edx,`16*5+12-64`($ctx)
556
lea (%rdx,%rdx,4),%edx # *5
557
shr \$26,$d1
558
mov %edx,`16*6+12-64`($ctx)
559
560
mov $h2,%rax
561
shl \$24,%rax
562
or %rax,$d1
563
mov $d1#d,`16*7+12-64`($ctx)
564
lea ($d1,$d1,4),$d1 # *5
565
mov $d1#d,`16*8+12-64`($ctx)
566
567
mov $r1,%rax
568
call __poly1305_block # r^4
569
570
mov \$0x3ffffff,%eax # save r^4 base 2^26
571
mov $h0,$d1
572
and $h0#d,%eax
573
shr \$26,$d1
574
mov %eax,`16*0+8-64`($ctx)
575
576
mov \$0x3ffffff,%edx
577
and $d1#d,%edx
578
mov %edx,`16*1+8-64`($ctx)
579
lea (%rdx,%rdx,4),%edx # *5
580
shr \$26,$d1
581
mov %edx,`16*2+8-64`($ctx)
582
583
mov $h1,%rax
584
shl \$12,%rax
585
or $d1,%rax
586
and \$0x3ffffff,%eax
587
mov %eax,`16*3+8-64`($ctx)
588
lea (%rax,%rax,4),%eax # *5
589
mov $h1,$d1
590
mov %eax,`16*4+8-64`($ctx)
591
592
mov \$0x3ffffff,%edx
593
shr \$14,$d1
594
and $d1#d,%edx
595
mov %edx,`16*5+8-64`($ctx)
596
lea (%rdx,%rdx,4),%edx # *5
597
shr \$26,$d1
598
mov %edx,`16*6+8-64`($ctx)
599
600
mov $h2,%rax
601
shl \$24,%rax
602
or %rax,$d1
603
mov $d1#d,`16*7+8-64`($ctx)
604
lea ($d1,$d1,4),$d1 # *5
605
mov $d1#d,`16*8+8-64`($ctx)
606
607
lea -48-64($ctx),$ctx # size [de-]optimization
608
pop %rbp
609
RET
610
.size __poly1305_init_avx,.-__poly1305_init_avx
611
___
612
613
&declare_function("poly1305_blocks_avx", 32, 4);
614
$code.=<<___;
615
.cfi_startproc
616
mov 20($ctx),%r8d # is_base2_26
617
cmp \$128,$len
618
jae .Lblocks_avx
619
test %r8d,%r8d
620
jz .Lblocks
621
622
.Lblocks_avx:
623
and \$-16,$len
624
jz .Lno_data_avx
625
626
vzeroupper
627
628
test %r8d,%r8d
629
jz .Lbase2_64_avx
630
631
test \$31,$len
632
jz .Leven_avx
633
634
push %rbp
635
.cfi_push %rbp
636
mov %rsp,%rbp
637
push %rbx
638
.cfi_push %rbx
639
push %r12
640
.cfi_push %r12
641
push %r13
642
.cfi_push %r13
643
push %r14
644
.cfi_push %r14
645
push %r15
646
.cfi_push %r15
647
.Lblocks_avx_body:
648
649
mov $len,%r15 # reassign $len
650
651
mov 0($ctx),$d1 # load hash value
652
mov 8($ctx),$d2
653
mov 16($ctx),$h2#d
654
655
mov 24($ctx),$r0 # load r
656
mov 32($ctx),$s1
657
658
################################# base 2^26 -> base 2^64
659
mov $d1#d,$h0#d
660
and \$`-1*(1<<31)`,$d1
661
mov $d2,$r1 # borrow $r1
662
mov $d2#d,$h1#d
663
and \$`-1*(1<<31)`,$d2
664
665
shr \$6,$d1
666
shl \$52,$r1
667
add $d1,$h0
668
shr \$12,$h1
669
shr \$18,$d2
670
add $r1,$h0
671
adc $d2,$h1
672
673
mov $h2,$d1
674
shl \$40,$d1
675
shr \$24,$h2
676
add $d1,$h1
677
adc \$0,$h2 # can be partially reduced...
678
679
mov \$-4,$d2 # ... so reduce
680
mov $h2,$d1
681
and $h2,$d2
682
shr \$2,$d1
683
and \$3,$h2
684
add $d2,$d1 # =*5
685
add $d1,$h0
686
adc \$0,$h1
687
adc \$0,$h2
688
689
mov $s1,$r1
690
mov $s1,%rax
691
shr \$2,$s1
692
add $r1,$s1 # s1 = r1 + (r1 >> 2)
693
694
add 0($inp),$h0 # accumulate input
695
adc 8($inp),$h1
696
lea 16($inp),$inp
697
adc $padbit,$h2
698
699
call __poly1305_block
700
701
test $padbit,$padbit # if $padbit is zero,
702
jz .Lstore_base2_64_avx # store hash in base 2^64 format
703
704
################################# base 2^64 -> base 2^26
705
mov $h0,%rax
706
mov $h0,%rdx
707
shr \$52,$h0
708
mov $h1,$r0
709
mov $h1,$r1
710
shr \$26,%rdx
711
and \$0x3ffffff,%rax # h[0]
712
shl \$12,$r0
713
and \$0x3ffffff,%rdx # h[1]
714
shr \$14,$h1
715
or $r0,$h0
716
shl \$24,$h2
717
and \$0x3ffffff,$h0 # h[2]
718
shr \$40,$r1
719
and \$0x3ffffff,$h1 # h[3]
720
or $r1,$h2 # h[4]
721
722
sub \$16,%r15
723
jz .Lstore_base2_26_avx
724
725
vmovd %rax#d,$H0
726
vmovd %rdx#d,$H1
727
vmovd $h0#d,$H2
728
vmovd $h1#d,$H3
729
vmovd $h2#d,$H4
730
jmp .Lproceed_avx
731
732
.align 32
733
.Lstore_base2_64_avx:
734
mov $h0,0($ctx)
735
mov $h1,8($ctx)
736
mov $h2,16($ctx) # note that is_base2_26 is zeroed
737
jmp .Ldone_avx
738
739
.align 16
740
.Lstore_base2_26_avx:
741
mov %rax#d,0($ctx) # store hash value base 2^26
742
mov %rdx#d,4($ctx)
743
mov $h0#d,8($ctx)
744
mov $h1#d,12($ctx)
745
mov $h2#d,16($ctx)
746
.align 16
747
.Ldone_avx:
748
pop %r15
749
.cfi_restore %r15
750
pop %r14
751
.cfi_restore %r14
752
pop %r13
753
.cfi_restore %r13
754
pop %r12
755
.cfi_restore %r12
756
pop %rbx
757
.cfi_restore %rbx
758
pop %rbp
759
.cfi_restore %rbp
760
.Lno_data_avx:
761
.Lblocks_avx_epilogue:
762
RET
763
.cfi_endproc
764
765
.align 32
766
.Lbase2_64_avx:
767
.cfi_startproc
768
push %rbp
769
.cfi_push %rbp
770
mov %rsp,%rbp
771
push %rbx
772
.cfi_push %rbx
773
push %r12
774
.cfi_push %r12
775
push %r13
776
.cfi_push %r13
777
push %r14
778
.cfi_push %r14
779
push %r15
780
.cfi_push %r15
781
.Lbase2_64_avx_body:
782
783
mov $len,%r15 # reassign $len
784
785
mov 24($ctx),$r0 # load r
786
mov 32($ctx),$s1
787
788
mov 0($ctx),$h0 # load hash value
789
mov 8($ctx),$h1
790
mov 16($ctx),$h2#d
791
792
mov $s1,$r1
793
mov $s1,%rax
794
shr \$2,$s1
795
add $r1,$s1 # s1 = r1 + (r1 >> 2)
796
797
test \$31,$len
798
jz .Linit_avx
799
800
add 0($inp),$h0 # accumulate input
801
adc 8($inp),$h1
802
lea 16($inp),$inp
803
adc $padbit,$h2
804
sub \$16,%r15
805
806
call __poly1305_block
807
808
.Linit_avx:
809
################################# base 2^64 -> base 2^26
810
mov $h0,%rax
811
mov $h0,%rdx
812
shr \$52,$h0
813
mov $h1,$d1
814
mov $h1,$d2
815
shr \$26,%rdx
816
and \$0x3ffffff,%rax # h[0]
817
shl \$12,$d1
818
and \$0x3ffffff,%rdx # h[1]
819
shr \$14,$h1
820
or $d1,$h0
821
shl \$24,$h2
822
and \$0x3ffffff,$h0 # h[2]
823
shr \$40,$d2
824
and \$0x3ffffff,$h1 # h[3]
825
or $d2,$h2 # h[4]
826
827
vmovd %rax#d,$H0
828
vmovd %rdx#d,$H1
829
vmovd $h0#d,$H2
830
vmovd $h1#d,$H3
831
vmovd $h2#d,$H4
832
movl \$1,20($ctx) # set is_base2_26
833
834
call __poly1305_init_avx
835
836
.Lproceed_avx:
837
mov %r15,$len
838
pop %r15
839
.cfi_restore %r15
840
pop %r14
841
.cfi_restore %r14
842
pop %r13
843
.cfi_restore %r13
844
pop %r12
845
.cfi_restore %r12
846
pop %rbx
847
.cfi_restore %rbx
848
pop %rbp
849
.cfi_restore %rbp
850
.Lbase2_64_avx_epilogue:
851
jmp .Ldo_avx
852
.cfi_endproc
853
854
.align 32
855
.Leven_avx:
856
.cfi_startproc
857
vmovd 4*0($ctx),$H0 # load hash value
858
vmovd 4*1($ctx),$H1
859
vmovd 4*2($ctx),$H2
860
vmovd 4*3($ctx),$H3
861
vmovd 4*4($ctx),$H4
862
863
.Ldo_avx:
864
___
865
$code.=<<___ if (!$win64);
866
lea 8(%rsp),%r10
867
.cfi_def_cfa_register %r10
868
and \$-32,%rsp
869
sub \$-8,%rsp
870
lea -0x58(%rsp),%r11
871
sub \$0x178,%rsp
872
___
873
$code.=<<___ if ($win64);
874
lea -0xf8(%rsp),%r11
875
sub \$0x218,%rsp
876
vmovdqa %xmm6,0x50(%r11)
877
vmovdqa %xmm7,0x60(%r11)
878
vmovdqa %xmm8,0x70(%r11)
879
vmovdqa %xmm9,0x80(%r11)
880
vmovdqa %xmm10,0x90(%r11)
881
vmovdqa %xmm11,0xa0(%r11)
882
vmovdqa %xmm12,0xb0(%r11)
883
vmovdqa %xmm13,0xc0(%r11)
884
vmovdqa %xmm14,0xd0(%r11)
885
vmovdqa %xmm15,0xe0(%r11)
886
.Ldo_avx_body:
887
___
888
$code.=<<___;
889
sub \$64,$len
890
lea -32($inp),%rax
891
cmovc %rax,$inp
892
893
vmovdqu `16*3`($ctx),$D4 # preload r0^2
894
lea `16*3+64`($ctx),$ctx # size optimization
895
lea .Lconst(%rip),%rcx
896
897
################################################################
898
# load input
899
vmovdqu 16*2($inp),$T0
900
vmovdqu 16*3($inp),$T1
901
vmovdqa 64(%rcx),$MASK # .Lmask26
902
903
vpsrldq \$6,$T0,$T2 # splat input
904
vpsrldq \$6,$T1,$T3
905
vpunpckhqdq $T1,$T0,$T4 # 4
906
vpunpcklqdq $T1,$T0,$T0 # 0:1
907
vpunpcklqdq $T3,$T2,$T3 # 2:3
908
909
vpsrlq \$40,$T4,$T4 # 4
910
vpsrlq \$26,$T0,$T1
911
vpand $MASK,$T0,$T0 # 0
912
vpsrlq \$4,$T3,$T2
913
vpand $MASK,$T1,$T1 # 1
914
vpsrlq \$30,$T3,$T3
915
vpand $MASK,$T2,$T2 # 2
916
vpand $MASK,$T3,$T3 # 3
917
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
918
919
jbe .Lskip_loop_avx
920
921
# expand and copy pre-calculated table to stack
922
vmovdqu `16*1-64`($ctx),$D1
923
vmovdqu `16*2-64`($ctx),$D2
924
vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
925
vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
926
vmovdqa $D3,-0x90(%r11)
927
vmovdqa $D0,0x00(%rsp)
928
vpshufd \$0xEE,$D1,$D4
929
vmovdqu `16*3-64`($ctx),$D0
930
vpshufd \$0x44,$D1,$D1
931
vmovdqa $D4,-0x80(%r11)
932
vmovdqa $D1,0x10(%rsp)
933
vpshufd \$0xEE,$D2,$D3
934
vmovdqu `16*4-64`($ctx),$D1
935
vpshufd \$0x44,$D2,$D2
936
vmovdqa $D3,-0x70(%r11)
937
vmovdqa $D2,0x20(%rsp)
938
vpshufd \$0xEE,$D0,$D4
939
vmovdqu `16*5-64`($ctx),$D2
940
vpshufd \$0x44,$D0,$D0
941
vmovdqa $D4,-0x60(%r11)
942
vmovdqa $D0,0x30(%rsp)
943
vpshufd \$0xEE,$D1,$D3
944
vmovdqu `16*6-64`($ctx),$D0
945
vpshufd \$0x44,$D1,$D1
946
vmovdqa $D3,-0x50(%r11)
947
vmovdqa $D1,0x40(%rsp)
948
vpshufd \$0xEE,$D2,$D4
949
vmovdqu `16*7-64`($ctx),$D1
950
vpshufd \$0x44,$D2,$D2
951
vmovdqa $D4,-0x40(%r11)
952
vmovdqa $D2,0x50(%rsp)
953
vpshufd \$0xEE,$D0,$D3
954
vmovdqu `16*8-64`($ctx),$D2
955
vpshufd \$0x44,$D0,$D0
956
vmovdqa $D3,-0x30(%r11)
957
vmovdqa $D0,0x60(%rsp)
958
vpshufd \$0xEE,$D1,$D4
959
vpshufd \$0x44,$D1,$D1
960
vmovdqa $D4,-0x20(%r11)
961
vmovdqa $D1,0x70(%rsp)
962
vpshufd \$0xEE,$D2,$D3
963
vmovdqa 0x00(%rsp),$D4 # preload r0^2
964
vpshufd \$0x44,$D2,$D2
965
vmovdqa $D3,-0x10(%r11)
966
vmovdqa $D2,0x80(%rsp)
967
968
jmp .Loop_avx
969
970
.align 32
971
.Loop_avx:
972
################################################################
973
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
974
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
975
# \___________________/
976
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
977
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
978
# \___________________/ \____________________/
979
#
980
# Note that we start with inp[2:3]*r^2. This is because it
981
# doesn't depend on reduction in previous iteration.
982
################################################################
983
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
984
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
985
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
986
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
987
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
988
#
989
# though note that $Tx and $Hx are "reversed" in this section,
990
# and $D4 is preloaded with r0^2...
991
992
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
993
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
994
vmovdqa $H2,0x20(%r11) # offload hash
995
vpmuludq $T2,$D4,$D2 # d3 = h2*r0
996
vmovdqa 0x10(%rsp),$H2 # r1^2
997
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
998
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
999
1000
vmovdqa $H0,0x00(%r11) #
1001
vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
1002
vmovdqa $H1,0x10(%r11) #
1003
vpmuludq $T3,$H2,$H1 # h3*r1
1004
vpaddq $H0,$D0,$D0 # d0 += h4*s1
1005
vpaddq $H1,$D4,$D4 # d4 += h3*r1
1006
vmovdqa $H3,0x30(%r11) #
1007
vpmuludq $T2,$H2,$H0 # h2*r1
1008
vpmuludq $T1,$H2,$H1 # h1*r1
1009
vpaddq $H0,$D3,$D3 # d3 += h2*r1
1010
vmovdqa 0x30(%rsp),$H3 # r2^2
1011
vpaddq $H1,$D2,$D2 # d2 += h1*r1
1012
vmovdqa $H4,0x40(%r11) #
1013
vpmuludq $T0,$H2,$H2 # h0*r1
1014
vpmuludq $T2,$H3,$H0 # h2*r2
1015
vpaddq $H2,$D1,$D1 # d1 += h0*r1
1016
1017
vmovdqa 0x40(%rsp),$H4 # s2^2
1018
vpaddq $H0,$D4,$D4 # d4 += h2*r2
1019
vpmuludq $T1,$H3,$H1 # h1*r2
1020
vpmuludq $T0,$H3,$H3 # h0*r2
1021
vpaddq $H1,$D3,$D3 # d3 += h1*r2
1022
vmovdqa 0x50(%rsp),$H2 # r3^2
1023
vpaddq $H3,$D2,$D2 # d2 += h0*r2
1024
vpmuludq $T4,$H4,$H0 # h4*s2
1025
vpmuludq $T3,$H4,$H4 # h3*s2
1026
vpaddq $H0,$D1,$D1 # d1 += h4*s2
1027
vmovdqa 0x60(%rsp),$H3 # s3^2
1028
vpaddq $H4,$D0,$D0 # d0 += h3*s2
1029
1030
vmovdqa 0x80(%rsp),$H4 # s4^2
1031
vpmuludq $T1,$H2,$H1 # h1*r3
1032
vpmuludq $T0,$H2,$H2 # h0*r3
1033
vpaddq $H1,$D4,$D4 # d4 += h1*r3
1034
vpaddq $H2,$D3,$D3 # d3 += h0*r3
1035
vpmuludq $T4,$H3,$H0 # h4*s3
1036
vpmuludq $T3,$H3,$H1 # h3*s3
1037
vpaddq $H0,$D2,$D2 # d2 += h4*s3
1038
vmovdqu 16*0($inp),$H0 # load input
1039
vpaddq $H1,$D1,$D1 # d1 += h3*s3
1040
vpmuludq $T2,$H3,$H3 # h2*s3
1041
vpmuludq $T2,$H4,$T2 # h2*s4
1042
vpaddq $H3,$D0,$D0 # d0 += h2*s3
1043
1044
vmovdqu 16*1($inp),$H1 #
1045
vpaddq $T2,$D1,$D1 # d1 += h2*s4
1046
vpmuludq $T3,$H4,$T3 # h3*s4
1047
vpmuludq $T4,$H4,$T4 # h4*s4
1048
vpsrldq \$6,$H0,$H2 # splat input
1049
vpaddq $T3,$D2,$D2 # d2 += h3*s4
1050
vpaddq $T4,$D3,$D3 # d3 += h4*s4
1051
vpsrldq \$6,$H1,$H3 #
1052
vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
1053
vpmuludq $T1,$H4,$T0 # h1*s4
1054
vpunpckhqdq $H1,$H0,$H4 # 4
1055
vpaddq $T4,$D4,$D4 # d4 += h0*r4
1056
vmovdqa -0x90(%r11),$T4 # r0^4
1057
vpaddq $T0,$D0,$D0 # d0 += h1*s4
1058
1059
vpunpcklqdq $H1,$H0,$H0 # 0:1
1060
vpunpcklqdq $H3,$H2,$H3 # 2:3
1061
1062
#vpsrlq \$40,$H4,$H4 # 4
1063
vpsrldq \$`40/8`,$H4,$H4 # 4
1064
vpsrlq \$26,$H0,$H1
1065
vpand $MASK,$H0,$H0 # 0
1066
vpsrlq \$4,$H3,$H2
1067
vpand $MASK,$H1,$H1 # 1
1068
vpand 0(%rcx),$H4,$H4 # .Lmask24
1069
vpsrlq \$30,$H3,$H3
1070
vpand $MASK,$H2,$H2 # 2
1071
vpand $MASK,$H3,$H3 # 3
1072
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1073
1074
vpaddq 0x00(%r11),$H0,$H0 # add hash value
1075
vpaddq 0x10(%r11),$H1,$H1
1076
vpaddq 0x20(%r11),$H2,$H2
1077
vpaddq 0x30(%r11),$H3,$H3
1078
vpaddq 0x40(%r11),$H4,$H4
1079
1080
lea 16*2($inp),%rax
1081
lea 16*4($inp),$inp
1082
sub \$64,$len
1083
cmovc %rax,$inp
1084
1085
################################################################
1086
# Now we accumulate (inp[0:1]+hash)*r^4
1087
################################################################
1088
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1089
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1090
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1091
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1092
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1093
1094
vpmuludq $H0,$T4,$T0 # h0*r0
1095
vpmuludq $H1,$T4,$T1 # h1*r0
1096
vpaddq $T0,$D0,$D0
1097
vpaddq $T1,$D1,$D1
1098
vmovdqa -0x80(%r11),$T2 # r1^4
1099
vpmuludq $H2,$T4,$T0 # h2*r0
1100
vpmuludq $H3,$T4,$T1 # h3*r0
1101
vpaddq $T0,$D2,$D2
1102
vpaddq $T1,$D3,$D3
1103
vpmuludq $H4,$T4,$T4 # h4*r0
1104
vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1105
vpaddq $T4,$D4,$D4
1106
1107
vpaddq $T0,$D0,$D0 # d0 += h4*s1
1108
vpmuludq $H2,$T2,$T1 # h2*r1
1109
vpmuludq $H3,$T2,$T0 # h3*r1
1110
vpaddq $T1,$D3,$D3 # d3 += h2*r1
1111
vmovdqa -0x60(%r11),$T3 # r2^4
1112
vpaddq $T0,$D4,$D4 # d4 += h3*r1
1113
vpmuludq $H1,$T2,$T1 # h1*r1
1114
vpmuludq $H0,$T2,$T2 # h0*r1
1115
vpaddq $T1,$D2,$D2 # d2 += h1*r1
1116
vpaddq $T2,$D1,$D1 # d1 += h0*r1
1117
1118
vmovdqa -0x50(%r11),$T4 # s2^4
1119
vpmuludq $H2,$T3,$T0 # h2*r2
1120
vpmuludq $H1,$T3,$T1 # h1*r2
1121
vpaddq $T0,$D4,$D4 # d4 += h2*r2
1122
vpaddq $T1,$D3,$D3 # d3 += h1*r2
1123
vmovdqa -0x40(%r11),$T2 # r3^4
1124
vpmuludq $H0,$T3,$T3 # h0*r2
1125
vpmuludq $H4,$T4,$T0 # h4*s2
1126
vpaddq $T3,$D2,$D2 # d2 += h0*r2
1127
vpaddq $T0,$D1,$D1 # d1 += h4*s2
1128
vmovdqa -0x30(%r11),$T3 # s3^4
1129
vpmuludq $H3,$T4,$T4 # h3*s2
1130
vpmuludq $H1,$T2,$T1 # h1*r3
1131
vpaddq $T4,$D0,$D0 # d0 += h3*s2
1132
1133
vmovdqa -0x10(%r11),$T4 # s4^4
1134
vpaddq $T1,$D4,$D4 # d4 += h1*r3
1135
vpmuludq $H0,$T2,$T2 # h0*r3
1136
vpmuludq $H4,$T3,$T0 # h4*s3
1137
vpaddq $T2,$D3,$D3 # d3 += h0*r3
1138
vpaddq $T0,$D2,$D2 # d2 += h4*s3
1139
vmovdqu 16*2($inp),$T0 # load input
1140
vpmuludq $H3,$T3,$T2 # h3*s3
1141
vpmuludq $H2,$T3,$T3 # h2*s3
1142
vpaddq $T2,$D1,$D1 # d1 += h3*s3
1143
vmovdqu 16*3($inp),$T1 #
1144
vpaddq $T3,$D0,$D0 # d0 += h2*s3
1145
1146
vpmuludq $H2,$T4,$H2 # h2*s4
1147
vpmuludq $H3,$T4,$H3 # h3*s4
1148
vpsrldq \$6,$T0,$T2 # splat input
1149
vpaddq $H2,$D1,$D1 # d1 += h2*s4
1150
vpmuludq $H4,$T4,$H4 # h4*s4
1151
vpsrldq \$6,$T1,$T3 #
1152
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1153
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1154
vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1155
vpmuludq $H1,$T4,$H0
1156
vpunpckhqdq $T1,$T0,$T4 # 4
1157
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1158
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1159
1160
vpunpcklqdq $T1,$T0,$T0 # 0:1
1161
vpunpcklqdq $T3,$T2,$T3 # 2:3
1162
1163
#vpsrlq \$40,$T4,$T4 # 4
1164
vpsrldq \$`40/8`,$T4,$T4 # 4
1165
vpsrlq \$26,$T0,$T1
1166
vmovdqa 0x00(%rsp),$D4 # preload r0^2
1167
vpand $MASK,$T0,$T0 # 0
1168
vpsrlq \$4,$T3,$T2
1169
vpand $MASK,$T1,$T1 # 1
1170
vpand 0(%rcx),$T4,$T4 # .Lmask24
1171
vpsrlq \$30,$T3,$T3
1172
vpand $MASK,$T2,$T2 # 2
1173
vpand $MASK,$T3,$T3 # 3
1174
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1175
1176
################################################################
1177
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1178
# and P. Schwabe
1179
1180
vpsrlq \$26,$H3,$D3
1181
vpand $MASK,$H3,$H3
1182
vpaddq $D3,$H4,$H4 # h3 -> h4
1183
1184
vpsrlq \$26,$H0,$D0
1185
vpand $MASK,$H0,$H0
1186
vpaddq $D0,$D1,$H1 # h0 -> h1
1187
1188
vpsrlq \$26,$H4,$D0
1189
vpand $MASK,$H4,$H4
1190
1191
vpsrlq \$26,$H1,$D1
1192
vpand $MASK,$H1,$H1
1193
vpaddq $D1,$H2,$H2 # h1 -> h2
1194
1195
vpaddq $D0,$H0,$H0
1196
vpsllq \$2,$D0,$D0
1197
vpaddq $D0,$H0,$H0 # h4 -> h0
1198
1199
vpsrlq \$26,$H2,$D2
1200
vpand $MASK,$H2,$H2
1201
vpaddq $D2,$H3,$H3 # h2 -> h3
1202
1203
vpsrlq \$26,$H0,$D0
1204
vpand $MASK,$H0,$H0
1205
vpaddq $D0,$H1,$H1 # h0 -> h1
1206
1207
vpsrlq \$26,$H3,$D3
1208
vpand $MASK,$H3,$H3
1209
vpaddq $D3,$H4,$H4 # h3 -> h4
1210
1211
ja .Loop_avx
1212
1213
.Lskip_loop_avx:
1214
################################################################
1215
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1216
1217
vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1218
add \$32,$len
1219
jnz .Long_tail_avx
1220
1221
vpaddq $H2,$T2,$T2
1222
vpaddq $H0,$T0,$T0
1223
vpaddq $H1,$T1,$T1
1224
vpaddq $H3,$T3,$T3
1225
vpaddq $H4,$T4,$T4
1226
1227
.Long_tail_avx:
1228
vmovdqa $H2,0x20(%r11)
1229
vmovdqa $H0,0x00(%r11)
1230
vmovdqa $H1,0x10(%r11)
1231
vmovdqa $H3,0x30(%r11)
1232
vmovdqa $H4,0x40(%r11)
1233
1234
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1235
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1236
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1237
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1238
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1239
1240
vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1241
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1242
vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1243
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1244
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1245
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1246
1247
vpmuludq $T3,$H2,$H0 # h3*r1
1248
vpaddq $H0,$D4,$D4 # d4 += h3*r1
1249
vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1250
vpmuludq $T2,$H2,$H1 # h2*r1
1251
vpaddq $H1,$D3,$D3 # d3 += h2*r1
1252
vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1253
vpmuludq $T1,$H2,$H0 # h1*r1
1254
vpaddq $H0,$D2,$D2 # d2 += h1*r1
1255
vpmuludq $T0,$H2,$H2 # h0*r1
1256
vpaddq $H2,$D1,$D1 # d1 += h0*r1
1257
vpmuludq $T4,$H3,$H3 # h4*s1
1258
vpaddq $H3,$D0,$D0 # d0 += h4*s1
1259
1260
vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1261
vpmuludq $T2,$H4,$H1 # h2*r2
1262
vpaddq $H1,$D4,$D4 # d4 += h2*r2
1263
vpmuludq $T1,$H4,$H0 # h1*r2
1264
vpaddq $H0,$D3,$D3 # d3 += h1*r2
1265
vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1266
vpmuludq $T0,$H4,$H4 # h0*r2
1267
vpaddq $H4,$D2,$D2 # d2 += h0*r2
1268
vpmuludq $T4,$H2,$H1 # h4*s2
1269
vpaddq $H1,$D1,$D1 # d1 += h4*s2
1270
vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1271
vpmuludq $T3,$H2,$H2 # h3*s2
1272
vpaddq $H2,$D0,$D0 # d0 += h3*s2
1273
1274
vpmuludq $T1,$H3,$H0 # h1*r3
1275
vpaddq $H0,$D4,$D4 # d4 += h1*r3
1276
vpmuludq $T0,$H3,$H3 # h0*r3
1277
vpaddq $H3,$D3,$D3 # d3 += h0*r3
1278
vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1279
vpmuludq $T4,$H4,$H1 # h4*s3
1280
vpaddq $H1,$D2,$D2 # d2 += h4*s3
1281
vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1282
vpmuludq $T3,$H4,$H0 # h3*s3
1283
vpaddq $H0,$D1,$D1 # d1 += h3*s3
1284
vpmuludq $T2,$H4,$H4 # h2*s3
1285
vpaddq $H4,$D0,$D0 # d0 += h2*s3
1286
1287
vpmuludq $T0,$H2,$H2 # h0*r4
1288
vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1289
vpmuludq $T4,$H3,$H1 # h4*s4
1290
vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1291
vpmuludq $T3,$H3,$H0 # h3*s4
1292
vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1293
vpmuludq $T2,$H3,$H1 # h2*s4
1294
vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1295
vpmuludq $T1,$H3,$H3 # h1*s4
1296
vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1297
1298
jz .Lshort_tail_avx
1299
1300
vmovdqu 16*0($inp),$H0 # load input
1301
vmovdqu 16*1($inp),$H1
1302
1303
vpsrldq \$6,$H0,$H2 # splat input
1304
vpsrldq \$6,$H1,$H3
1305
vpunpckhqdq $H1,$H0,$H4 # 4
1306
vpunpcklqdq $H1,$H0,$H0 # 0:1
1307
vpunpcklqdq $H3,$H2,$H3 # 2:3
1308
1309
vpsrlq \$40,$H4,$H4 # 4
1310
vpsrlq \$26,$H0,$H1
1311
vpand $MASK,$H0,$H0 # 0
1312
vpsrlq \$4,$H3,$H2
1313
vpand $MASK,$H1,$H1 # 1
1314
vpsrlq \$30,$H3,$H3
1315
vpand $MASK,$H2,$H2 # 2
1316
vpand $MASK,$H3,$H3 # 3
1317
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1318
1319
vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1320
vpaddq 0x00(%r11),$H0,$H0
1321
vpaddq 0x10(%r11),$H1,$H1
1322
vpaddq 0x20(%r11),$H2,$H2
1323
vpaddq 0x30(%r11),$H3,$H3
1324
vpaddq 0x40(%r11),$H4,$H4
1325
1326
################################################################
1327
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1328
1329
vpmuludq $H0,$T4,$T0 # h0*r0
1330
vpaddq $T0,$D0,$D0 # d0 += h0*r0
1331
vpmuludq $H1,$T4,$T1 # h1*r0
1332
vpaddq $T1,$D1,$D1 # d1 += h1*r0
1333
vpmuludq $H2,$T4,$T0 # h2*r0
1334
vpaddq $T0,$D2,$D2 # d2 += h2*r0
1335
vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1336
vpmuludq $H3,$T4,$T1 # h3*r0
1337
vpaddq $T1,$D3,$D3 # d3 += h3*r0
1338
vpmuludq $H4,$T4,$T4 # h4*r0
1339
vpaddq $T4,$D4,$D4 # d4 += h4*r0
1340
1341
vpmuludq $H3,$T2,$T0 # h3*r1
1342
vpaddq $T0,$D4,$D4 # d4 += h3*r1
1343
vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1344
vpmuludq $H2,$T2,$T1 # h2*r1
1345
vpaddq $T1,$D3,$D3 # d3 += h2*r1
1346
vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1347
vpmuludq $H1,$T2,$T0 # h1*r1
1348
vpaddq $T0,$D2,$D2 # d2 += h1*r1
1349
vpmuludq $H0,$T2,$T2 # h0*r1
1350
vpaddq $T2,$D1,$D1 # d1 += h0*r1
1351
vpmuludq $H4,$T3,$T3 # h4*s1
1352
vpaddq $T3,$D0,$D0 # d0 += h4*s1
1353
1354
vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1355
vpmuludq $H2,$T4,$T1 # h2*r2
1356
vpaddq $T1,$D4,$D4 # d4 += h2*r2
1357
vpmuludq $H1,$T4,$T0 # h1*r2
1358
vpaddq $T0,$D3,$D3 # d3 += h1*r2
1359
vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1360
vpmuludq $H0,$T4,$T4 # h0*r2
1361
vpaddq $T4,$D2,$D2 # d2 += h0*r2
1362
vpmuludq $H4,$T2,$T1 # h4*s2
1363
vpaddq $T1,$D1,$D1 # d1 += h4*s2
1364
vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1365
vpmuludq $H3,$T2,$T2 # h3*s2
1366
vpaddq $T2,$D0,$D0 # d0 += h3*s2
1367
1368
vpmuludq $H1,$T3,$T0 # h1*r3
1369
vpaddq $T0,$D4,$D4 # d4 += h1*r3
1370
vpmuludq $H0,$T3,$T3 # h0*r3
1371
vpaddq $T3,$D3,$D3 # d3 += h0*r3
1372
vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1373
vpmuludq $H4,$T4,$T1 # h4*s3
1374
vpaddq $T1,$D2,$D2 # d2 += h4*s3
1375
vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1376
vpmuludq $H3,$T4,$T0 # h3*s3
1377
vpaddq $T0,$D1,$D1 # d1 += h3*s3
1378
vpmuludq $H2,$T4,$T4 # h2*s3
1379
vpaddq $T4,$D0,$D0 # d0 += h2*s3
1380
1381
vpmuludq $H0,$T2,$T2 # h0*r4
1382
vpaddq $T2,$D4,$D4 # d4 += h0*r4
1383
vpmuludq $H4,$T3,$T1 # h4*s4
1384
vpaddq $T1,$D3,$D3 # d3 += h4*s4
1385
vpmuludq $H3,$T3,$T0 # h3*s4
1386
vpaddq $T0,$D2,$D2 # d2 += h3*s4
1387
vpmuludq $H2,$T3,$T1 # h2*s4
1388
vpaddq $T1,$D1,$D1 # d1 += h2*s4
1389
vpmuludq $H1,$T3,$T3 # h1*s4
1390
vpaddq $T3,$D0,$D0 # d0 += h1*s4
1391
1392
.Lshort_tail_avx:
1393
################################################################
1394
# horizontal addition
1395
1396
vpsrldq \$8,$D4,$T4
1397
vpsrldq \$8,$D3,$T3
1398
vpsrldq \$8,$D1,$T1
1399
vpsrldq \$8,$D0,$T0
1400
vpsrldq \$8,$D2,$T2
1401
vpaddq $T3,$D3,$D3
1402
vpaddq $T4,$D4,$D4
1403
vpaddq $T0,$D0,$D0
1404
vpaddq $T1,$D1,$D1
1405
vpaddq $T2,$D2,$D2
1406
1407
################################################################
1408
# lazy reduction
1409
1410
vpsrlq \$26,$D3,$H3
1411
vpand $MASK,$D3,$D3
1412
vpaddq $H3,$D4,$D4 # h3 -> h4
1413
1414
vpsrlq \$26,$D0,$H0
1415
vpand $MASK,$D0,$D0
1416
vpaddq $H0,$D1,$D1 # h0 -> h1
1417
1418
vpsrlq \$26,$D4,$H4
1419
vpand $MASK,$D4,$D4
1420
1421
vpsrlq \$26,$D1,$H1
1422
vpand $MASK,$D1,$D1
1423
vpaddq $H1,$D2,$D2 # h1 -> h2
1424
1425
vpaddq $H4,$D0,$D0
1426
vpsllq \$2,$H4,$H4
1427
vpaddq $H4,$D0,$D0 # h4 -> h0
1428
1429
vpsrlq \$26,$D2,$H2
1430
vpand $MASK,$D2,$D2
1431
vpaddq $H2,$D3,$D3 # h2 -> h3
1432
1433
vpsrlq \$26,$D0,$H0
1434
vpand $MASK,$D0,$D0
1435
vpaddq $H0,$D1,$D1 # h0 -> h1
1436
1437
vpsrlq \$26,$D3,$H3
1438
vpand $MASK,$D3,$D3
1439
vpaddq $H3,$D4,$D4 # h3 -> h4
1440
1441
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1442
vmovd $D1,`4*1-48-64`($ctx)
1443
vmovd $D2,`4*2-48-64`($ctx)
1444
vmovd $D3,`4*3-48-64`($ctx)
1445
vmovd $D4,`4*4-48-64`($ctx)
1446
___
1447
$code.=<<___ if ($win64);
1448
vmovdqa 0x50(%r11),%xmm6
1449
vmovdqa 0x60(%r11),%xmm7
1450
vmovdqa 0x70(%r11),%xmm8
1451
vmovdqa 0x80(%r11),%xmm9
1452
vmovdqa 0x90(%r11),%xmm10
1453
vmovdqa 0xa0(%r11),%xmm11
1454
vmovdqa 0xb0(%r11),%xmm12
1455
vmovdqa 0xc0(%r11),%xmm13
1456
vmovdqa 0xd0(%r11),%xmm14
1457
vmovdqa 0xe0(%r11),%xmm15
1458
lea 0xf8(%r11),%rsp
1459
.Ldo_avx_epilogue:
1460
___
1461
$code.=<<___ if (!$win64);
1462
lea -8(%r10),%rsp
1463
.cfi_def_cfa_register %rsp
1464
___
1465
$code.=<<___;
1466
vzeroupper
1467
RET
1468
.cfi_endproc
1469
___
1470
&end_function("poly1305_blocks_avx");
1471
1472
# poly1305_emit_avx: produce the final 16-byte result when the hash value may
# be stored in base 2^26 form.
#
# The generated code tests the 32-bit word at 20($ctx) (the is_base2_26 flag)
# and branches to .Lemit, which is defined outside this block, when it is zero.
# Otherwise it:
#   1. loads five 32-bit limbs from 0..16($ctx) and repacks them into
#      %r8 (h0), %r9 (h1) and %r10 (h2);
#   2. keeps only the low two bits of h2 and adds (h2 >> 2) + (h2 & -4),
#      i.e. 5 * (h2 >> 2), back into h0 with carry propagation, because the
#      stored limbs may be only partially reduced;
#   3. computes h + 5 and, if that carries past bit 1 of h2 (the 130-bit
#      value overflowed), selects h + 5 instead of h via cmovnz;
#   4. adds the two 64-bit words at $nonce with carry and stores the low
#      128 bits at $mac.
#
# NOTE(review): the 32 and 3 passed to declare_function are presumably the
# alignment and the argument count -- confirm against declare_function.
&declare_function("poly1305_emit_avx", 32, 3);
1473
$code.=<<___;
1474
cmpl \$0,20($ctx) # is_base2_26?
1475
je .Lemit
1476
1477
mov 0($ctx),%eax # load hash value base 2^26
1478
mov 4($ctx),%ecx
1479
mov 8($ctx),%r8d
1480
mov 12($ctx),%r11d
1481
mov 16($ctx),%r10d
1482
1483
shl \$26,%rcx # base 2^26 -> base 2^64
1484
mov %r8,%r9
1485
shl \$52,%r8
1486
add %rcx,%rax
1487
shr \$12,%r9
1488
add %rax,%r8 # h0
1489
adc \$0,%r9
1490
1491
shl \$14,%r11
1492
mov %r10,%rax
1493
shr \$24,%r10
1494
add %r11,%r9
1495
shl \$40,%rax
1496
add %rax,%r9 # h1
1497
adc \$0,%r10 # h2
1498
1499
mov %r10,%rax # could be partially reduced, so reduce
1500
mov %r10,%rcx
1501
and \$3,%r10
1502
shr \$2,%rax
1503
and \$-4,%rcx
1504
add %rcx,%rax
1505
add %rax,%r8
1506
adc \$0,%r9
1507
adc \$0,%r10
1508
1509
mov %r8,%rax
1510
add \$5,%r8 # compare to modulus
1511
mov %r9,%rcx
1512
adc \$0,%r9
1513
adc \$0,%r10
1514
shr \$2,%r10 # did 130-bit value overflow?
1515
cmovnz %r8,%rax
1516
cmovnz %r9,%rcx
1517
1518
add 0($nonce),%rax # accumulate nonce
1519
adc 8($nonce),%rcx
1520
mov %rax,0($mac) # write result
1521
mov %rcx,8($mac)
1522
1523
RET
1524
___
1525
&end_function("poly1305_emit_avx");
1526
1527
if ($avx>1) {
1528
1529
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1530
map("%ymm$_",(0..15));
1531
my $S4=$MASK;
1532
1533
sub poly1305_blocks_avxN {
1534
my ($avx512) = @_;
1535
my $suffix = $avx512 ? "_avx512" : "";
1536
$code.=<<___;
1537
.cfi_startproc
1538
mov 20($ctx),%r8d # is_base2_26
1539
cmp \$128,$len
1540
jae .Lblocks_avx2$suffix
1541
test %r8d,%r8d
1542
jz .Lblocks
1543
1544
.Lblocks_avx2$suffix:
1545
and \$-16,$len
1546
jz .Lno_data_avx2$suffix
1547
1548
vzeroupper
1549
1550
test %r8d,%r8d
1551
jz .Lbase2_64_avx2$suffix
1552
1553
test \$63,$len
1554
jz .Leven_avx2$suffix
1555
1556
push %rbp
1557
.cfi_push %rbp
1558
mov %rsp,%rbp
1559
push %rbx
1560
.cfi_push %rbx
1561
push %r12
1562
.cfi_push %r12
1563
push %r13
1564
.cfi_push %r13
1565
push %r14
1566
.cfi_push %r14
1567
push %r15
1568
.cfi_push %r15
1569
.Lblocks_avx2_body$suffix:
1570
1571
mov $len,%r15 # reassign $len
1572
1573
mov 0($ctx),$d1 # load hash value
1574
mov 8($ctx),$d2
1575
mov 16($ctx),$h2#d
1576
1577
mov 24($ctx),$r0 # load r
1578
mov 32($ctx),$s1
1579
1580
################################# base 2^26 -> base 2^64
1581
mov $d1#d,$h0#d
1582
and \$`-1*(1<<31)`,$d1
1583
mov $d2,$r1 # borrow $r1
1584
mov $d2#d,$h1#d
1585
and \$`-1*(1<<31)`,$d2
1586
1587
shr \$6,$d1
1588
shl \$52,$r1
1589
add $d1,$h0
1590
shr \$12,$h1
1591
shr \$18,$d2
1592
add $r1,$h0
1593
adc $d2,$h1
1594
1595
mov $h2,$d1
1596
shl \$40,$d1
1597
shr \$24,$h2
1598
add $d1,$h1
1599
adc \$0,$h2 # can be partially reduced...
1600
1601
mov \$-4,$d2 # ... so reduce
1602
mov $h2,$d1
1603
and $h2,$d2
1604
shr \$2,$d1
1605
and \$3,$h2
1606
add $d2,$d1 # =*5
1607
add $d1,$h0
1608
adc \$0,$h1
1609
adc \$0,$h2
1610
1611
mov $s1,$r1
1612
mov $s1,%rax
1613
shr \$2,$s1
1614
add $r1,$s1 # s1 = r1 + (r1 >> 2)
1615
1616
.Lbase2_26_pre_avx2$suffix:
1617
add 0($inp),$h0 # accumulate input
1618
adc 8($inp),$h1
1619
lea 16($inp),$inp
1620
adc $padbit,$h2
1621
sub \$16,%r15
1622
1623
call __poly1305_block
1624
mov $r1,%rax
1625
1626
test \$63,%r15
1627
jnz .Lbase2_26_pre_avx2$suffix
1628
1629
test $padbit,$padbit # if $padbit is zero,
1630
jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1631
1632
################################# base 2^64 -> base 2^26
1633
mov $h0,%rax
1634
mov $h0,%rdx
1635
shr \$52,$h0
1636
mov $h1,$r0
1637
mov $h1,$r1
1638
shr \$26,%rdx
1639
and \$0x3ffffff,%rax # h[0]
1640
shl \$12,$r0
1641
and \$0x3ffffff,%rdx # h[1]
1642
shr \$14,$h1
1643
or $r0,$h0
1644
shl \$24,$h2
1645
and \$0x3ffffff,$h0 # h[2]
1646
shr \$40,$r1
1647
and \$0x3ffffff,$h1 # h[3]
1648
or $r1,$h2 # h[4]
1649
1650
test %r15,%r15
1651
jz .Lstore_base2_26_avx2$suffix
1652
1653
vmovd %rax#d,%x#$H0
1654
vmovd %rdx#d,%x#$H1
1655
vmovd $h0#d,%x#$H2
1656
vmovd $h1#d,%x#$H3
1657
vmovd $h2#d,%x#$H4
1658
jmp .Lproceed_avx2$suffix
1659
1660
.align 32
1661
.Lstore_base2_64_avx2$suffix:
1662
mov $h0,0($ctx)
1663
mov $h1,8($ctx)
1664
mov $h2,16($ctx) # note that is_base2_26 is zeroed
1665
jmp .Ldone_avx2$suffix
1666
1667
.align 16
1668
.Lstore_base2_26_avx2$suffix:
1669
mov %rax#d,0($ctx) # store hash value base 2^26
1670
mov %rdx#d,4($ctx)
1671
mov $h0#d,8($ctx)
1672
mov $h1#d,12($ctx)
1673
mov $h2#d,16($ctx)
1674
.align 16
1675
.Ldone_avx2$suffix:
1676
pop %r15
1677
.cfi_restore %r15
1678
pop %r14
1679
.cfi_restore %r14
1680
pop %r13
1681
.cfi_restore %r13
1682
pop %r12
1683
.cfi_restore %r12
1684
pop %rbx
1685
.cfi_restore %rbx
1686
pop %rbp
1687
.cfi_restore %rbp
1688
.Lno_data_avx2$suffix:
1689
.Lblocks_avx2_epilogue$suffix:
1690
RET
1691
.cfi_endproc
1692
1693
.align 32
1694
.Lbase2_64_avx2$suffix:
1695
.cfi_startproc
1696
push %rbp
1697
.cfi_push %rbp
1698
mov %rsp,%rbp
1699
push %rbx
1700
.cfi_push %rbx
1701
push %r12
1702
.cfi_push %r12
1703
push %r13
1704
.cfi_push %r13
1705
push %r14
1706
.cfi_push %r14
1707
push %r15
1708
.cfi_push %r15
1709
.Lbase2_64_avx2_body$suffix:
1710
1711
mov $len,%r15 # reassign $len
1712
1713
mov 24($ctx),$r0 # load r
1714
mov 32($ctx),$s1
1715
1716
mov 0($ctx),$h0 # load hash value
1717
mov 8($ctx),$h1
1718
mov 16($ctx),$h2#d
1719
1720
mov $s1,$r1
1721
mov $s1,%rax
1722
shr \$2,$s1
1723
add $r1,$s1 # s1 = r1 + (r1 >> 2)
1724
1725
test \$63,$len
1726
jz .Linit_avx2$suffix
1727
1728
.Lbase2_64_pre_avx2$suffix:
1729
add 0($inp),$h0 # accumulate input
1730
adc 8($inp),$h1
1731
lea 16($inp),$inp
1732
adc $padbit,$h2
1733
sub \$16,%r15
1734
1735
call __poly1305_block
1736
mov $r1,%rax
1737
1738
test \$63,%r15
1739
jnz .Lbase2_64_pre_avx2$suffix
1740
1741
.Linit_avx2$suffix:
1742
################################# base 2^64 -> base 2^26
1743
mov $h0,%rax
1744
mov $h0,%rdx
1745
shr \$52,$h0
1746
mov $h1,$d1
1747
mov $h1,$d2
1748
shr \$26,%rdx
1749
and \$0x3ffffff,%rax # h[0]
1750
shl \$12,$d1
1751
and \$0x3ffffff,%rdx # h[1]
1752
shr \$14,$h1
1753
or $d1,$h0
1754
shl \$24,$h2
1755
and \$0x3ffffff,$h0 # h[2]
1756
shr \$40,$d2
1757
and \$0x3ffffff,$h1 # h[3]
1758
or $d2,$h2 # h[4]
1759
1760
vmovd %rax#d,%x#$H0
1761
vmovd %rdx#d,%x#$H1
1762
vmovd $h0#d,%x#$H2
1763
vmovd $h1#d,%x#$H3
1764
vmovd $h2#d,%x#$H4
1765
movl \$1,20($ctx) # set is_base2_26
1766
1767
call __poly1305_init_avx
1768
1769
.Lproceed_avx2$suffix:
1770
mov %r15,$len # restore $len
1771
___
1772
$code.=<<___ if (!$kernel);
1773
mov OPENSSL_ia32cap_P+8(%rip),%r9d
1774
mov \$`(1<<31|1<<30|1<<16)`,%r11d
1775
___
1776
$code.=<<___;
1777
pop %r15
1778
.cfi_restore %r15
1779
pop %r14
1780
.cfi_restore %r14
1781
pop %r13
1782
.cfi_restore %r13
1783
pop %r12
1784
.cfi_restore %r12
1785
pop %rbx
1786
.cfi_restore %rbx
1787
pop %rbp
1788
.cfi_restore %rbp
1789
.Lbase2_64_avx2_epilogue$suffix:
1790
jmp .Ldo_avx2$suffix
1791
.cfi_endproc
1792
1793
.align 32
1794
.Leven_avx2$suffix:
1795
.cfi_startproc
1796
___
1797
$code.=<<___ if (!$kernel);
1798
mov OPENSSL_ia32cap_P+8(%rip),%r9d
1799
___
1800
$code.=<<___;
1801
vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1802
vmovd 4*1($ctx),%x#$H1
1803
vmovd 4*2($ctx),%x#$H2
1804
vmovd 4*3($ctx),%x#$H3
1805
vmovd 4*4($ctx),%x#$H4
1806
1807
.Ldo_avx2$suffix:
1808
___
1809
$code.=<<___ if (!$kernel && $avx>2);
1810
cmp \$512,$len
1811
jb .Lskip_avx512
1812
and %r11d,%r9d
1813
test \$`1<<16`,%r9d # check for AVX512F
1814
jnz .Lblocks_avx512
1815
.Lskip_avx512$suffix:
1816
___
1817
$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1818
cmp \$512,$len
1819
jae .Lblocks_avx512
1820
___
1821
$code.=<<___ if (!$win64);
1822
lea 8(%rsp),%r10
1823
.cfi_def_cfa_register %r10
1824
sub \$0x128,%rsp
1825
___
1826
$code.=<<___ if ($win64);
1827
lea 8(%rsp),%r10
1828
sub \$0x1c8,%rsp
1829
vmovdqa %xmm6,-0xb0(%r10)
1830
vmovdqa %xmm7,-0xa0(%r10)
1831
vmovdqa %xmm8,-0x90(%r10)
1832
vmovdqa %xmm9,-0x80(%r10)
1833
vmovdqa %xmm10,-0x70(%r10)
1834
vmovdqa %xmm11,-0x60(%r10)
1835
vmovdqa %xmm12,-0x50(%r10)
1836
vmovdqa %xmm13,-0x40(%r10)
1837
vmovdqa %xmm14,-0x30(%r10)
1838
vmovdqa %xmm15,-0x20(%r10)
1839
.Ldo_avx2_body$suffix:
1840
___
1841
$code.=<<___;
1842
lea .Lconst(%rip),%rcx
1843
lea 48+64($ctx),$ctx # size optimization
1844
vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1845
1846
# expand and copy pre-calculated table to stack
1847
vmovdqu `16*0-64`($ctx),%x#$T2
1848
and \$-512,%rsp
1849
vmovdqu `16*1-64`($ctx),%x#$T3
1850
vmovdqu `16*2-64`($ctx),%x#$T4
1851
vmovdqu `16*3-64`($ctx),%x#$D0
1852
vmovdqu `16*4-64`($ctx),%x#$D1
1853
vmovdqu `16*5-64`($ctx),%x#$D2
1854
lea 0x90(%rsp),%rax # size optimization
1855
vmovdqu `16*6-64`($ctx),%x#$D3
1856
vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1857
vmovdqu `16*7-64`($ctx),%x#$D4
1858
vpermd $T3,$T0,$T3
1859
vmovdqu `16*8-64`($ctx),%x#$MASK
1860
vpermd $T4,$T0,$T4
1861
vmovdqa $T2,0x00(%rsp)
1862
vpermd $D0,$T0,$D0
1863
vmovdqa $T3,0x20-0x90(%rax)
1864
vpermd $D1,$T0,$D1
1865
vmovdqa $T4,0x40-0x90(%rax)
1866
vpermd $D2,$T0,$D2
1867
vmovdqa $D0,0x60-0x90(%rax)
1868
vpermd $D3,$T0,$D3
1869
vmovdqa $D1,0x80-0x90(%rax)
1870
vpermd $D4,$T0,$D4
1871
vmovdqa $D2,0xa0-0x90(%rax)
1872
vpermd $MASK,$T0,$MASK
1873
vmovdqa $D3,0xc0-0x90(%rax)
1874
vmovdqa $D4,0xe0-0x90(%rax)
1875
vmovdqa $MASK,0x100-0x90(%rax)
1876
vmovdqa 64(%rcx),$MASK # .Lmask26
1877
1878
################################################################
1879
# load input
1880
vmovdqu 16*0($inp),%x#$T0
1881
vmovdqu 16*1($inp),%x#$T1
1882
vinserti128 \$1,16*2($inp),$T0,$T0
1883
vinserti128 \$1,16*3($inp),$T1,$T1
1884
lea 16*4($inp),$inp
1885
1886
vpsrldq \$6,$T0,$T2 # splat input
1887
vpsrldq \$6,$T1,$T3
1888
vpunpckhqdq $T1,$T0,$T4 # 4
1889
vpunpcklqdq $T3,$T2,$T2 # 2:3
1890
vpunpcklqdq $T1,$T0,$T0 # 0:1
1891
1892
vpsrlq \$30,$T2,$T3
1893
vpsrlq \$4,$T2,$T2
1894
vpsrlq \$26,$T0,$T1
1895
vpsrlq \$40,$T4,$T4 # 4
1896
vpand $MASK,$T2,$T2 # 2
1897
vpand $MASK,$T0,$T0 # 0
1898
vpand $MASK,$T1,$T1 # 1
1899
vpand $MASK,$T3,$T3 # 3
1900
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1901
1902
vpaddq $H2,$T2,$H2 # accumulate input
1903
sub \$64,$len
1904
jz .Ltail_avx2$suffix
1905
jmp .Loop_avx2$suffix
1906
1907
.align 32
1908
.Loop_avx2$suffix:
1909
################################################################
1910
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1911
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1912
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1913
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1914
# \________/\__________/
1915
################################################################
1916
#vpaddq $H2,$T2,$H2 # accumulate input
1917
vpaddq $H0,$T0,$H0
1918
vmovdqa `32*0`(%rsp),$T0 # r0^4
1919
vpaddq $H1,$T1,$H1
1920
vmovdqa `32*1`(%rsp),$T1 # r1^4
1921
vpaddq $H3,$T3,$H3
1922
vmovdqa `32*3`(%rsp),$T2 # r2^4
1923
vpaddq $H4,$T4,$H4
1924
vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1925
vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1926
1927
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1928
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1929
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1930
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1931
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1932
#
1933
# however, as h2 is "chronologically" first one available pull
1934
# corresponding operations up, so it's
1935
#
1936
# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1937
# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1938
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1939
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1940
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1941
1942
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1943
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1944
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1945
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1946
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1947
1948
vpmuludq $H0,$T1,$T4 # h0*r1
1949
vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1950
vpaddq $T4,$D1,$D1 # d1 += h0*r1
1951
vpaddq $H2,$D2,$D2 # d2 += h1*r1
1952
vpmuludq $H3,$T1,$T4 # h3*r1
1953
vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1954
vpaddq $T4,$D4,$D4 # d4 += h3*r1
1955
vpaddq $H2,$D0,$D0 # d0 += h4*s1
1956
vmovdqa `32*4-0x90`(%rax),$T1 # s2
1957
1958
vpmuludq $H0,$T0,$T4 # h0*r0
1959
vpmuludq $H1,$T0,$H2 # h1*r0
1960
vpaddq $T4,$D0,$D0 # d0 += h0*r0
1961
vpaddq $H2,$D1,$D1 # d1 += h1*r0
1962
vpmuludq $H3,$T0,$T4 # h3*r0
1963
vpmuludq $H4,$T0,$H2 # h4*r0
1964
vmovdqu 16*0($inp),%x#$T0 # load input
1965
vpaddq $T4,$D3,$D3 # d3 += h3*r0
1966
vpaddq $H2,$D4,$D4 # d4 += h4*r0
1967
vinserti128 \$1,16*2($inp),$T0,$T0
1968
1969
vpmuludq $H3,$T1,$T4 # h3*s2
1970
vpmuludq $H4,$T1,$H2 # h4*s2
1971
vmovdqu 16*1($inp),%x#$T1
1972
vpaddq $T4,$D0,$D0 # d0 += h3*s2
1973
vpaddq $H2,$D1,$D1 # d1 += h4*s2
1974
vmovdqa `32*5-0x90`(%rax),$H2 # r3
1975
vpmuludq $H1,$T2,$T4 # h1*r2
1976
vpmuludq $H0,$T2,$T2 # h0*r2
1977
vpaddq $T4,$D3,$D3 # d3 += h1*r2
1978
vpaddq $T2,$D2,$D2 # d2 += h0*r2
1979
vinserti128 \$1,16*3($inp),$T1,$T1
1980
lea 16*4($inp),$inp
1981
1982
vpmuludq $H1,$H2,$T4 # h1*r3
1983
vpmuludq $H0,$H2,$H2 # h0*r3
1984
vpsrldq \$6,$T0,$T2 # splat input
1985
vpaddq $T4,$D4,$D4 # d4 += h1*r3
1986
vpaddq $H2,$D3,$D3 # d3 += h0*r3
1987
vpmuludq $H3,$T3,$T4 # h3*s3
1988
vpmuludq $H4,$T3,$H2 # h4*s3
1989
vpsrldq \$6,$T1,$T3
1990
vpaddq $T4,$D1,$D1 # d1 += h3*s3
1991
vpaddq $H2,$D2,$D2 # d2 += h4*s3
1992
vpunpckhqdq $T1,$T0,$T4 # 4
1993
1994
vpmuludq $H3,$S4,$H3 # h3*s4
1995
vpmuludq $H4,$S4,$H4 # h4*s4
1996
vpunpcklqdq $T1,$T0,$T0 # 0:1
1997
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1998
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1999
vpunpcklqdq $T3,$T2,$T3 # 2:3
2000
vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
2001
vpmuludq $H1,$S4,$H0 # h1*s4
2002
vmovdqa 64(%rcx),$MASK # .Lmask26
2003
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2004
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2005
2006
################################################################
2007
# lazy reduction (interleaved with tail of input splat)
2008
2009
vpsrlq \$26,$H3,$D3
2010
vpand $MASK,$H3,$H3
2011
vpaddq $D3,$H4,$H4 # h3 -> h4
2012
2013
vpsrlq \$26,$H0,$D0
2014
vpand $MASK,$H0,$H0
2015
vpaddq $D0,$D1,$H1 # h0 -> h1
2016
2017
vpsrlq \$26,$H4,$D4
2018
vpand $MASK,$H4,$H4
2019
2020
vpsrlq \$4,$T3,$T2
2021
2022
vpsrlq \$26,$H1,$D1
2023
vpand $MASK,$H1,$H1
2024
vpaddq $D1,$H2,$H2 # h1 -> h2
2025
2026
vpaddq $D4,$H0,$H0
2027
vpsllq \$2,$D4,$D4
2028
vpaddq $D4,$H0,$H0 # h4 -> h0
2029
2030
vpand $MASK,$T2,$T2 # 2
2031
vpsrlq \$26,$T0,$T1
2032
2033
vpsrlq \$26,$H2,$D2
2034
vpand $MASK,$H2,$H2
2035
vpaddq $D2,$H3,$H3 # h2 -> h3
2036
2037
vpaddq $T2,$H2,$H2 # modulo-scheduled
2038
vpsrlq \$30,$T3,$T3
2039
2040
vpsrlq \$26,$H0,$D0
2041
vpand $MASK,$H0,$H0
2042
vpaddq $D0,$H1,$H1 # h0 -> h1
2043
2044
vpsrlq \$40,$T4,$T4 # 4
2045
2046
vpsrlq \$26,$H3,$D3
2047
vpand $MASK,$H3,$H3
2048
vpaddq $D3,$H4,$H4 # h3 -> h4
2049
2050
vpand $MASK,$T0,$T0 # 0
2051
vpand $MASK,$T1,$T1 # 1
2052
vpand $MASK,$T3,$T3 # 3
2053
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2054
2055
sub \$64,$len
2056
jnz .Loop_avx2$suffix
2057
2058
.byte 0x66,0x90
2059
.Ltail_avx2$suffix:
2060
################################################################
2061
# while above multiplications were by r^4 in all lanes, in last
2062
# iteration we multiply least significant lane by r^4 and most
2063
# significant one by r, so copy of above except that references
2064
# to the precomputed table are displaced by 4...
2065
2066
#vpaddq $H2,$T2,$H2 # accumulate input
2067
vpaddq $H0,$T0,$H0
2068
vmovdqu `32*0+4`(%rsp),$T0 # r0^4
2069
vpaddq $H1,$T1,$H1
2070
vmovdqu `32*1+4`(%rsp),$T1 # r1^4
2071
vpaddq $H3,$T3,$H3
2072
vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2073
vpaddq $H4,$T4,$H4
2074
vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
2075
vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
2076
2077
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
2078
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
2079
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
2080
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
2081
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2082
2083
vpmuludq $H0,$T1,$T4 # h0*r1
2084
vpmuludq $H1,$T1,$H2 # h1*r1
2085
vpaddq $T4,$D1,$D1 # d1 += h0*r1
2086
vpaddq $H2,$D2,$D2 # d2 += h1*r1
2087
vpmuludq $H3,$T1,$T4 # h3*r1
2088
vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2089
vpaddq $T4,$D4,$D4 # d4 += h3*r1
2090
vpaddq $H2,$D0,$D0 # d0 += h4*s1
2091
2092
vpmuludq $H0,$T0,$T4 # h0*r0
2093
vpmuludq $H1,$T0,$H2 # h1*r0
2094
vpaddq $T4,$D0,$D0 # d0 += h0*r0
2095
vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2096
vpaddq $H2,$D1,$D1 # d1 += h1*r0
2097
vpmuludq $H3,$T0,$T4 # h3*r0
2098
vpmuludq $H4,$T0,$H2 # h4*r0
2099
vpaddq $T4,$D3,$D3 # d3 += h3*r0
2100
vpaddq $H2,$D4,$D4 # d4 += h4*r0
2101
2102
vpmuludq $H3,$T1,$T4 # h3*s2
2103
vpmuludq $H4,$T1,$H2 # h4*s2
2104
vpaddq $T4,$D0,$D0 # d0 += h3*s2
2105
vpaddq $H2,$D1,$D1 # d1 += h4*s2
2106
vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2107
vpmuludq $H1,$T2,$T4 # h1*r2
2108
vpmuludq $H0,$T2,$T2 # h0*r2
2109
vpaddq $T4,$D3,$D3 # d3 += h1*r2
2110
vpaddq $T2,$D2,$D2 # d2 += h0*r2
2111
2112
vpmuludq $H1,$H2,$T4 # h1*r3
2113
vpmuludq $H0,$H2,$H2 # h0*r3
2114
vpaddq $T4,$D4,$D4 # d4 += h1*r3
2115
vpaddq $H2,$D3,$D3 # d3 += h0*r3
2116
vpmuludq $H3,$T3,$T4 # h3*s3
2117
vpmuludq $H4,$T3,$H2 # h4*s3
2118
vpaddq $T4,$D1,$D1 # d1 += h3*s3
2119
vpaddq $H2,$D2,$D2 # d2 += h4*s3
2120
2121
vpmuludq $H3,$S4,$H3 # h3*s4
2122
vpmuludq $H4,$S4,$H4 # h4*s4
2123
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
2124
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
2125
vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2126
vpmuludq $H1,$S4,$H0 # h1*s4
2127
vmovdqa 64(%rcx),$MASK # .Lmask26
2128
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2129
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2130
2131
################################################################
2132
# horizontal addition
2133
2134
vpsrldq \$8,$D1,$T1
2135
vpsrldq \$8,$H2,$T2
2136
vpsrldq \$8,$H3,$T3
2137
vpsrldq \$8,$H4,$T4
2138
vpsrldq \$8,$H0,$T0
2139
vpaddq $T1,$D1,$D1
2140
vpaddq $T2,$H2,$H2
2141
vpaddq $T3,$H3,$H3
2142
vpaddq $T4,$H4,$H4
2143
vpaddq $T0,$H0,$H0
2144
2145
vpermq \$0x2,$H3,$T3
2146
vpermq \$0x2,$H4,$T4
2147
vpermq \$0x2,$H0,$T0
2148
vpermq \$0x2,$D1,$T1
2149
vpermq \$0x2,$H2,$T2
2150
vpaddq $T3,$H3,$H3
2151
vpaddq $T4,$H4,$H4
2152
vpaddq $T0,$H0,$H0
2153
vpaddq $T1,$D1,$D1
2154
vpaddq $T2,$H2,$H2
2155
2156
################################################################
2157
# lazy reduction
2158
2159
vpsrlq \$26,$H3,$D3
2160
vpand $MASK,$H3,$H3
2161
vpaddq $D3,$H4,$H4 # h3 -> h4
2162
2163
vpsrlq \$26,$H0,$D0
2164
vpand $MASK,$H0,$H0
2165
vpaddq $D0,$D1,$H1 # h0 -> h1
2166
2167
vpsrlq \$26,$H4,$D4
2168
vpand $MASK,$H4,$H4
2169
2170
vpsrlq \$26,$H1,$D1
2171
vpand $MASK,$H1,$H1
2172
vpaddq $D1,$H2,$H2 # h1 -> h2
2173
2174
vpaddq $D4,$H0,$H0
2175
vpsllq \$2,$D4,$D4
2176
vpaddq $D4,$H0,$H0 # h4 -> h0
2177
2178
vpsrlq \$26,$H2,$D2
2179
vpand $MASK,$H2,$H2
2180
vpaddq $D2,$H3,$H3 # h2 -> h3
2181
2182
vpsrlq \$26,$H0,$D0
2183
vpand $MASK,$H0,$H0
2184
vpaddq $D0,$H1,$H1 # h0 -> h1
2185
2186
vpsrlq \$26,$H3,$D3
2187
vpand $MASK,$H3,$H3
2188
vpaddq $D3,$H4,$H4 # h3 -> h4
2189
2190
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2191
vmovd %x#$H1,`4*1-48-64`($ctx)
2192
vmovd %x#$H2,`4*2-48-64`($ctx)
2193
vmovd %x#$H3,`4*3-48-64`($ctx)
2194
vmovd %x#$H4,`4*4-48-64`($ctx)
2195
___
2196
$code.=<<___ if ($win64);
2197
vmovdqa -0xb0(%r10),%xmm6
2198
vmovdqa -0xa0(%r10),%xmm7
2199
vmovdqa -0x90(%r10),%xmm8
2200
vmovdqa -0x80(%r10),%xmm9
2201
vmovdqa -0x70(%r10),%xmm10
2202
vmovdqa -0x60(%r10),%xmm11
2203
vmovdqa -0x50(%r10),%xmm12
2204
vmovdqa -0x40(%r10),%xmm13
2205
vmovdqa -0x30(%r10),%xmm14
2206
vmovdqa -0x20(%r10),%xmm15
2207
lea -8(%r10),%rsp
2208
.Ldo_avx2_epilogue$suffix:
2209
___
2210
$code.=<<___ if (!$win64);
2211
lea -8(%r10),%rsp
2212
.cfi_def_cfa_register %rsp
2213
___
2214
$code.=<<___;
2215
vzeroupper
2216
RET
2217
.cfi_endproc
2218
___
2219
if($avx > 2 && $avx512) {
2220
my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2221
my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2222
my $PADBIT="%zmm30";
2223
2224
map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2225
map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2226
map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2227
map(s/%y/%z/,($MASK));
2228
2229
$code.=<<___;
2230
.cfi_startproc
2231
.Lblocks_avx512:
2232
mov \$15,%eax
2233
kmovw %eax,%k2
2234
___
2235
$code.=<<___ if (!$win64);
2236
lea 8(%rsp),%r10
2237
.cfi_def_cfa_register %r10
2238
sub \$0x128,%rsp
2239
___
2240
$code.=<<___ if ($win64);
2241
lea 8(%rsp),%r10
2242
sub \$0x1c8,%rsp
2243
vmovdqa %xmm6,-0xb0(%r10)
2244
vmovdqa %xmm7,-0xa0(%r10)
2245
vmovdqa %xmm8,-0x90(%r10)
2246
vmovdqa %xmm9,-0x80(%r10)
2247
vmovdqa %xmm10,-0x70(%r10)
2248
vmovdqa %xmm11,-0x60(%r10)
2249
vmovdqa %xmm12,-0x50(%r10)
2250
vmovdqa %xmm13,-0x40(%r10)
2251
vmovdqa %xmm14,-0x30(%r10)
2252
vmovdqa %xmm15,-0x20(%r10)
2253
.Ldo_avx512_body:
2254
___
2255
$code.=<<___;
2256
lea .Lconst(%rip),%rcx
2257
lea 48+64($ctx),$ctx # size optimization
2258
vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2259
2260
# expand pre-calculated table
2261
vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2262
and \$-512,%rsp
2263
vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2264
mov \$0x20,%rax
2265
vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2266
vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2267
vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2268
vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2269
vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2270
vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2271
vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2272
vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2273
vpbroadcastq 64(%rcx),$MASK # .Lmask26
2274
vpermd $D1,$T2,$R1
2275
vpermd $T0,$T2,$S1
2276
vpermd $D2,$T2,$R2
2277
vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2278
vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2279
vpermd $T1,$T2,$S2
2280
vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2281
vpsrlq \$32,$R1,$T1
2282
vpermd $D3,$T2,$R3
2283
vmovdqa64 $S1,0x40(%rsp){%k2}
2284
vpermd $T3,$T2,$S3
2285
vpermd $D4,$T2,$R4
2286
vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2287
vpermd $T4,$T2,$S4
2288
vmovdqa64 $S2,0x80(%rsp){%k2}
2289
vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2290
vmovdqa64 $S3,0xc0(%rsp){%k2}
2291
vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2292
vmovdqa64 $S4,0x100(%rsp){%k2}
2293
2294
################################################################
2295
# calculate 5th through 8th powers of the key
2296
#
2297
# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2298
# d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2299
# d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2300
# d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2301
# d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2302
2303
vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2304
vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2305
vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2306
vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2307
vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2308
vpsrlq \$32,$R2,$T2
2309
2310
vpmuludq $T1,$S4,$M0
2311
vpmuludq $T1,$R0,$M1
2312
vpmuludq $T1,$R1,$M2
2313
vpmuludq $T1,$R2,$M3
2314
vpmuludq $T1,$R3,$M4
2315
vpsrlq \$32,$R3,$T3
2316
vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2317
vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2318
vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2319
vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2320
vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2321
2322
vpmuludq $T2,$S3,$M0
2323
vpmuludq $T2,$S4,$M1
2324
vpmuludq $T2,$R1,$M3
2325
vpmuludq $T2,$R2,$M4
2326
vpmuludq $T2,$R0,$M2
2327
vpsrlq \$32,$R4,$T4
2328
vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2329
vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2330
vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2331
vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2332
vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2333
2334
vpmuludq $T3,$S2,$M0
2335
vpmuludq $T3,$R0,$M3
2336
vpmuludq $T3,$R1,$M4
2337
vpmuludq $T3,$S3,$M1
2338
vpmuludq $T3,$S4,$M2
2339
vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2340
vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2341
vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2342
vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2343
vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2344
2345
vpmuludq $T4,$S4,$M3
2346
vpmuludq $T4,$R0,$M4
2347
vpmuludq $T4,$S1,$M0
2348
vpmuludq $T4,$S2,$M1
2349
vpmuludq $T4,$S3,$M2
2350
vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2351
vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2352
vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2353
vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2354
vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2355
2356
################################################################
2357
# load input
2358
vmovdqu64 16*0($inp),%z#$T3
2359
vmovdqu64 16*4($inp),%z#$T4
2360
lea 16*8($inp),$inp
2361
2362
################################################################
2363
# lazy reduction
2364
2365
vpsrlq \$26,$D3,$M3
2366
vpandq $MASK,$D3,$D3
2367
vpaddq $M3,$D4,$D4 # d3 -> d4
2368
2369
vpsrlq \$26,$D0,$M0
2370
vpandq $MASK,$D0,$D0
2371
vpaddq $M0,$D1,$D1 # d0 -> d1
2372
2373
vpsrlq \$26,$D4,$M4
2374
vpandq $MASK,$D4,$D4
2375
2376
vpsrlq \$26,$D1,$M1
2377
vpandq $MASK,$D1,$D1
2378
vpaddq $M1,$D2,$D2 # d1 -> d2
2379
2380
vpaddq $M4,$D0,$D0
2381
vpsllq \$2,$M4,$M4
2382
vpaddq $M4,$D0,$D0 # d4 -> d0
2383
2384
vpsrlq \$26,$D2,$M2
2385
vpandq $MASK,$D2,$D2
2386
vpaddq $M2,$D3,$D3 # d2 -> d3
2387
2388
vpsrlq \$26,$D0,$M0
2389
vpandq $MASK,$D0,$D0
2390
vpaddq $M0,$D1,$D1 # d0 -> d1
2391
2392
vpsrlq \$26,$D3,$M3
2393
vpandq $MASK,$D3,$D3
2394
vpaddq $M3,$D4,$D4 # d3 -> d4
2395
2396
################################################################
2397
# at this point we have 14243444 in $R0-$S4 and 05060708 in
2398
# $D0-$D4, ...
2399
2400
vpunpcklqdq $T4,$T3,$T0 # transpose input
2401
vpunpckhqdq $T4,$T3,$T4
2402
2403
# ... since input 64-bit lanes are ordered as 73625140, we could
2404
# "vperm" it to 76543210 (here and in each loop iteration), *or*
2405
# we could just flow along, hence the goal for $R0-$S4 is
2406
# 1858286838784888 ...
2407
2408
vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2409
mov \$0x7777,%eax
2410
kmovw %eax,%k1
2411
2412
vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2413
vpermd $R1,$M0,$R1
2414
vpermd $R2,$M0,$R2
2415
vpermd $R3,$M0,$R3
2416
vpermd $R4,$M0,$R4
2417
2418
vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2419
vpermd $D1,$M0,${R1}{%k1}
2420
vpermd $D2,$M0,${R2}{%k1}
2421
vpermd $D3,$M0,${R3}{%k1}
2422
vpermd $D4,$M0,${R4}{%k1}
2423
2424
vpslld \$2,$R1,$S1 # *5
2425
vpslld \$2,$R2,$S2
2426
vpslld \$2,$R3,$S3
2427
vpslld \$2,$R4,$S4
2428
vpaddd $R1,$S1,$S1
2429
vpaddd $R2,$S2,$S2
2430
vpaddd $R3,$S3,$S3
2431
vpaddd $R4,$S4,$S4
2432
2433
vpbroadcastq 32(%rcx),$PADBIT # .L129
2434
2435
vpsrlq \$52,$T0,$T2 # splat input
2436
vpsllq \$12,$T4,$T3
2437
vporq $T3,$T2,$T2
2438
vpsrlq \$26,$T0,$T1
2439
vpsrlq \$14,$T4,$T3
2440
vpsrlq \$40,$T4,$T4 # 4
2441
vpandq $MASK,$T2,$T2 # 2
2442
vpandq $MASK,$T0,$T0 # 0
2443
#vpandq $MASK,$T1,$T1 # 1
2444
#vpandq $MASK,$T3,$T3 # 3
2445
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
2446
2447
vpaddq $H2,$T2,$H2 # accumulate input
2448
sub \$192,$len
2449
jbe .Ltail_avx512
2450
jmp .Loop_avx512
2451
2452
.align 32
2453
.Loop_avx512:
2454
################################################################
2455
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2456
# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2457
# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2458
# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2459
# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2460
# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2461
# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2462
# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2463
# \________/\___________/
2464
################################################################
2465
#vpaddq $H2,$T2,$H2 # accumulate input
2466
2467
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2468
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2469
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2470
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2471
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2472
#
2473
# however, as h2 is "chronologically" first one available pull
2474
# corresponding operations up, so it's
2475
#
2476
# d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2477
# d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2478
# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2479
# d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2480
# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2481
2482
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2483
vpaddq $H0,$T0,$H0
2484
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2485
vpandq $MASK,$T1,$T1 # 1
2486
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2487
vpandq $MASK,$T3,$T3 # 3
2488
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2489
vporq $PADBIT,$T4,$T4 # padbit, yes, always
2490
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2491
vpaddq $H1,$T1,$H1 # accumulate input
2492
vpaddq $H3,$T3,$H3
2493
vpaddq $H4,$T4,$H4
2494
2495
vmovdqu64 16*0($inp),$T3 # load input
2496
vmovdqu64 16*4($inp),$T4
2497
lea 16*8($inp),$inp
2498
vpmuludq $H0,$R3,$M3
2499
vpmuludq $H0,$R4,$M4
2500
vpmuludq $H0,$R0,$M0
2501
vpmuludq $H0,$R1,$M1
2502
vpaddq $M3,$D3,$D3 # d3 += h0*r3
2503
vpaddq $M4,$D4,$D4 # d4 += h0*r4
2504
vpaddq $M0,$D0,$D0 # d0 += h0*r0
2505
vpaddq $M1,$D1,$D1 # d1 += h0*r1
2506
2507
vpmuludq $H1,$R2,$M3
2508
vpmuludq $H1,$R3,$M4
2509
vpmuludq $H1,$S4,$M0
2510
vpmuludq $H0,$R2,$M2
2511
vpaddq $M3,$D3,$D3 # d3 += h1*r2
2512
vpaddq $M4,$D4,$D4 # d4 += h1*r3
2513
vpaddq $M0,$D0,$D0 # d0 += h1*s4
2514
vpaddq $M2,$D2,$D2 # d2 += h0*r2
2515
2516
vpunpcklqdq $T4,$T3,$T0 # transpose input
2517
vpunpckhqdq $T4,$T3,$T4
2518
2519
vpmuludq $H3,$R0,$M3
2520
vpmuludq $H3,$R1,$M4
2521
vpmuludq $H1,$R0,$M1
2522
vpmuludq $H1,$R1,$M2
2523
vpaddq $M3,$D3,$D3 # d3 += h3*r0
2524
vpaddq $M4,$D4,$D4 # d4 += h3*r1
2525
vpaddq $M1,$D1,$D1 # d1 += h1*r0
2526
vpaddq $M2,$D2,$D2 # d2 += h1*r1
2527
2528
vpmuludq $H4,$S4,$M3
2529
vpmuludq $H4,$R0,$M4
2530
vpmuludq $H3,$S2,$M0
2531
vpmuludq $H3,$S3,$M1
2532
vpaddq $M3,$D3,$D3 # d3 += h4*s4
2533
vpmuludq $H3,$S4,$M2
2534
vpaddq $M4,$D4,$D4 # d4 += h4*r0
2535
vpaddq $M0,$D0,$D0 # d0 += h3*s2
2536
vpaddq $M1,$D1,$D1 # d1 += h3*s3
2537
vpaddq $M2,$D2,$D2 # d2 += h3*s4
2538
2539
vpmuludq $H4,$S1,$M0
2540
vpmuludq $H4,$S2,$M1
2541
vpmuludq $H4,$S3,$M2
2542
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2543
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2544
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2545
2546
################################################################
2547
# lazy reduction (interleaved with input splat)
2548
2549
vpsrlq \$52,$T0,$T2 # splat input
2550
vpsllq \$12,$T4,$T3
2551
2552
vpsrlq \$26,$D3,$H3
2553
vpandq $MASK,$D3,$D3
2554
vpaddq $H3,$D4,$H4 # h3 -> h4
2555
2556
vporq $T3,$T2,$T2
2557
2558
vpsrlq \$26,$H0,$D0
2559
vpandq $MASK,$H0,$H0
2560
vpaddq $D0,$H1,$H1 # h0 -> h1
2561
2562
vpandq $MASK,$T2,$T2 # 2
2563
2564
vpsrlq \$26,$H4,$D4
2565
vpandq $MASK,$H4,$H4
2566
2567
vpsrlq \$26,$H1,$D1
2568
vpandq $MASK,$H1,$H1
2569
vpaddq $D1,$H2,$H2 # h1 -> h2
2570
2571
vpaddq $D4,$H0,$H0
2572
vpsllq \$2,$D4,$D4
2573
vpaddq $D4,$H0,$H0 # h4 -> h0
2574
2575
vpaddq $T2,$H2,$H2 # modulo-scheduled
2576
vpsrlq \$26,$T0,$T1
2577
2578
vpsrlq \$26,$H2,$D2
2579
vpandq $MASK,$H2,$H2
2580
vpaddq $D2,$D3,$H3 # h2 -> h3
2581
2582
vpsrlq \$14,$T4,$T3
2583
2584
vpsrlq \$26,$H0,$D0
2585
vpandq $MASK,$H0,$H0
2586
vpaddq $D0,$H1,$H1 # h0 -> h1
2587
2588
vpsrlq \$40,$T4,$T4 # 4
2589
2590
vpsrlq \$26,$H3,$D3
2591
vpandq $MASK,$H3,$H3
2592
vpaddq $D3,$H4,$H4 # h3 -> h4
2593
2594
vpandq $MASK,$T0,$T0 # 0
2595
#vpandq $MASK,$T1,$T1 # 1
2596
#vpandq $MASK,$T3,$T3 # 3
2597
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
2598
2599
sub \$128,$len
2600
ja .Loop_avx512
2601
2602
.Ltail_avx512:
2603
################################################################
2604
# while above multiplications were by r^8 in all lanes, in last
2605
# iteration we multiply least significant lane by r^8 and most
2606
# significant one by r, that's why table gets shifted...
2607
2608
vpsrlq \$32,$R0,$R0 # 0105020603070408
2609
vpsrlq \$32,$R1,$R1
2610
vpsrlq \$32,$R2,$R2
2611
vpsrlq \$32,$S3,$S3
2612
vpsrlq \$32,$S4,$S4
2613
vpsrlq \$32,$R3,$R3
2614
vpsrlq \$32,$R4,$R4
2615
vpsrlq \$32,$S1,$S1
2616
vpsrlq \$32,$S2,$S2
2617
2618
################################################################
2619
# load either next or last 64 byte of input
2620
lea ($inp,$len),$inp
2621
2622
#vpaddq $H2,$T2,$H2 # accumulate input
2623
vpaddq $H0,$T0,$H0
2624
2625
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2626
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2627
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2628
vpandq $MASK,$T1,$T1 # 1
2629
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2630
vpandq $MASK,$T3,$T3 # 3
2631
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2632
vporq $PADBIT,$T4,$T4 # padbit, yes, always
2633
vpaddq $H1,$T1,$H1 # accumulate input
2634
vpaddq $H3,$T3,$H3
2635
vpaddq $H4,$T4,$H4
2636
2637
vmovdqu 16*0($inp),%x#$T0
2638
vpmuludq $H0,$R3,$M3
2639
vpmuludq $H0,$R4,$M4
2640
vpmuludq $H0,$R0,$M0
2641
vpmuludq $H0,$R1,$M1
2642
vpaddq $M3,$D3,$D3 # d3 += h0*r3
2643
vpaddq $M4,$D4,$D4 # d4 += h0*r4
2644
vpaddq $M0,$D0,$D0 # d0 += h0*r0
2645
vpaddq $M1,$D1,$D1 # d1 += h0*r1
2646
2647
vmovdqu 16*1($inp),%x#$T1
2648
vpmuludq $H1,$R2,$M3
2649
vpmuludq $H1,$R3,$M4
2650
vpmuludq $H1,$S4,$M0
2651
vpmuludq $H0,$R2,$M2
2652
vpaddq $M3,$D3,$D3 # d3 += h1*r2
2653
vpaddq $M4,$D4,$D4 # d4 += h1*r3
2654
vpaddq $M0,$D0,$D0 # d0 += h1*s4
2655
vpaddq $M2,$D2,$D2 # d2 += h0*r2
2656
2657
vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2658
vpmuludq $H3,$R0,$M3
2659
vpmuludq $H3,$R1,$M4
2660
vpmuludq $H1,$R0,$M1
2661
vpmuludq $H1,$R1,$M2
2662
vpaddq $M3,$D3,$D3 # d3 += h3*r0
2663
vpaddq $M4,$D4,$D4 # d4 += h3*r1
2664
vpaddq $M1,$D1,$D1 # d1 += h1*r0
2665
vpaddq $M2,$D2,$D2 # d2 += h1*r1
2666
2667
vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2668
vpmuludq $H4,$S4,$M3
2669
vpmuludq $H4,$R0,$M4
2670
vpmuludq $H3,$S2,$M0
2671
vpmuludq $H3,$S3,$M1
2672
vpmuludq $H3,$S4,$M2
2673
vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2674
vpaddq $M4,$D4,$D4 # d4 += h4*r0
2675
vpaddq $M0,$D0,$D0 # d0 += h3*s2
2676
vpaddq $M1,$D1,$D1 # d1 += h3*s3
2677
vpaddq $M2,$D2,$D2 # d2 += h3*s4
2678
2679
vpmuludq $H4,$S1,$M0
2680
vpmuludq $H4,$S2,$M1
2681
vpmuludq $H4,$S3,$M2
2682
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2683
vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2684
vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2685
2686
################################################################
2687
# horizontal addition
2688
2689
mov \$1,%eax
2690
vpermq \$0xb1,$H3,$D3
2691
vpermq \$0xb1,$D4,$H4
2692
vpermq \$0xb1,$H0,$D0
2693
vpermq \$0xb1,$H1,$D1
2694
vpermq \$0xb1,$H2,$D2
2695
vpaddq $D3,$H3,$H3
2696
vpaddq $D4,$H4,$H4
2697
vpaddq $D0,$H0,$H0
2698
vpaddq $D1,$H1,$H1
2699
vpaddq $D2,$H2,$H2
2700
2701
kmovw %eax,%k3
2702
vpermq \$0x2,$H3,$D3
2703
vpermq \$0x2,$H4,$D4
2704
vpermq \$0x2,$H0,$D0
2705
vpermq \$0x2,$H1,$D1
2706
vpermq \$0x2,$H2,$D2
2707
vpaddq $D3,$H3,$H3
2708
vpaddq $D4,$H4,$H4
2709
vpaddq $D0,$H0,$H0
2710
vpaddq $D1,$H1,$H1
2711
vpaddq $D2,$H2,$H2
2712
2713
vextracti64x4 \$0x1,$H3,%y#$D3
2714
vextracti64x4 \$0x1,$H4,%y#$D4
2715
vextracti64x4 \$0x1,$H0,%y#$D0
2716
vextracti64x4 \$0x1,$H1,%y#$D1
2717
vextracti64x4 \$0x1,$H2,%y#$D2
2718
vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2719
vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2720
vpaddq $D0,$H0,${H0}{%k3}{z}
2721
vpaddq $D1,$H1,${H1}{%k3}{z}
2722
vpaddq $D2,$H2,${H2}{%k3}{z}
2723
___
2724
map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2725
map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2726
$code.=<<___;
2727
################################################################
2728
# lazy reduction (interleaved with input splat)
2729
2730
vpsrlq \$26,$H3,$D3
2731
vpand $MASK,$H3,$H3
2732
vpsrldq \$6,$T0,$T2 # splat input
2733
vpsrldq \$6,$T1,$T3
2734
vpunpckhqdq $T1,$T0,$T4 # 4
2735
vpaddq $D3,$H4,$H4 # h3 -> h4
2736
2737
vpsrlq \$26,$H0,$D0
2738
vpand $MASK,$H0,$H0
2739
vpunpcklqdq $T3,$T2,$T2 # 2:3
2740
vpunpcklqdq $T1,$T0,$T0 # 0:1
2741
vpaddq $D0,$H1,$H1 # h0 -> h1
2742
2743
vpsrlq \$26,$H4,$D4
2744
vpand $MASK,$H4,$H4
2745
2746
vpsrlq \$26,$H1,$D1
2747
vpand $MASK,$H1,$H1
2748
vpsrlq \$30,$T2,$T3
2749
vpsrlq \$4,$T2,$T2
2750
vpaddq $D1,$H2,$H2 # h1 -> h2
2751
2752
vpaddq $D4,$H0,$H0
2753
vpsllq \$2,$D4,$D4
2754
vpsrlq \$26,$T0,$T1
2755
vpsrlq \$40,$T4,$T4 # 4
2756
vpaddq $D4,$H0,$H0 # h4 -> h0
2757
2758
vpsrlq \$26,$H2,$D2
2759
vpand $MASK,$H2,$H2
2760
vpand $MASK,$T2,$T2 # 2
2761
vpand $MASK,$T0,$T0 # 0
2762
vpaddq $D2,$H3,$H3 # h2 -> h3
2763
2764
vpsrlq \$26,$H0,$D0
2765
vpand $MASK,$H0,$H0
2766
vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2767
vpand $MASK,$T1,$T1 # 1
2768
vpaddq $D0,$H1,$H1 # h0 -> h1
2769
2770
vpsrlq \$26,$H3,$D3
2771
vpand $MASK,$H3,$H3
2772
vpand $MASK,$T3,$T3 # 3
2773
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2774
vpaddq $D3,$H4,$H4 # h3 -> h4
2775
2776
lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2777
add \$64,$len
2778
jnz .Ltail_avx2$suffix
2779
2780
vpsubq $T2,$H2,$H2 # undo input accumulation
2781
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2782
vmovd %x#$H1,`4*1-48-64`($ctx)
2783
vmovd %x#$H2,`4*2-48-64`($ctx)
2784
vmovd %x#$H3,`4*3-48-64`($ctx)
2785
vmovd %x#$H4,`4*4-48-64`($ctx)
2786
vzeroall
2787
___
2788
$code.=<<___ if ($win64);
2789
movdqa -0xb0(%r10),%xmm6
2790
movdqa -0xa0(%r10),%xmm7
2791
movdqa -0x90(%r10),%xmm8
2792
movdqa -0x80(%r10),%xmm9
2793
movdqa -0x70(%r10),%xmm10
2794
movdqa -0x60(%r10),%xmm11
2795
movdqa -0x50(%r10),%xmm12
2796
movdqa -0x40(%r10),%xmm13
2797
movdqa -0x30(%r10),%xmm14
2798
movdqa -0x20(%r10),%xmm15
2799
lea -8(%r10),%rsp
2800
.Ldo_avx512_epilogue:
2801
___
2802
$code.=<<___ if (!$win64);
2803
lea -8(%r10),%rsp
2804
.cfi_def_cfa_register %rsp
2805
___
2806
$code.=<<___;
2807
RET
2808
.cfi_endproc
2809
___
2810
2811
}
2812
2813
}
2814
2815
# Emit poly1305_blocks_avx2: open the function, append the body produced
# by the shared poly1305_blocks_avxN generator called with 0 (the AVX-512
# variant further down calls it with 1), then close the function.
# NOTE(review): the arguments 32 and 4 are presumably the alignment and
# the argument count -- confirm against declare_function's definition.
&declare_function("poly1305_blocks_avx2", 32, 4);
2816
poly1305_blocks_avxN(0);
2817
&end_function("poly1305_blocks_avx2");
2818
2819
#######################################################################
2820
if ($avx>2) {
2821
# On entry we have input length divisible by 64. But since inner loop
2822
# processes 128 bytes per iteration, cases when length is not divisible
2823
# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2824
# reason stack layout is kept identical to poly1305_blocks_avx2. If not
2825
# for this tail, we wouldn't have to even allocate stack frame...
2826
2827
# Emit poly1305_blocks_avx512 from the same generator as the AVX2 entry
# point, this time called with 1 instead of 0.
# NOTE(review): the arguments 32 and 4 are presumably the alignment and
# the argument count -- confirm against declare_function's definition.
&declare_function("poly1305_blocks_avx512", 32, 4);
2828
poly1305_blocks_avxN(1);
2829
&end_function("poly1305_blocks_avx512");
2830
2831
if (!$kernel && $avx>3) {
2832
########################################################################
2833
# VPMADD52 version using 2^44 radix.
2834
#
2835
# One can argue that base 2^52 would be more natural. Well, even though
2836
# some operations would be more natural, one has to recognize a couple of
2837
# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
2838
# at amount of multiply-n-accumulate operations. Secondly, it makes it
2839
# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2840
# reference implementations], which means that more such operations
2841
# would have to be performed in inner loop, which in turn makes critical
2842
# path longer. In other words, even though base 2^44 reduction might
2843
# look less elegant, overall critical path is actually shorter...
2844
2845
########################################################################
2846
# Layout of the opaque area is as follows.
2847
#
2848
# unsigned __int64 h[3]; # current hash value base 2^44
2849
# unsigned __int64 s[2]; # key value*20 base 2^44
2850
# unsigned __int64 r[3]; # key value base 2^44
2851
# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2852
# # r^n positions reflect
2853
# # placement in register, not
2854
# # memory, R[3] is R[1]*20
2855
2856
# Emit poly1305_init_base2_44. As visible in the assembly below, it:
#   - zeroes the three hash qwords at 0/8/16($ctx);
#   - masks the two key qwords read from $inp with the constants
#     0x0ffffffc0fffffff and 0x0ffffffc0ffffffc;
#   - splits the masked key into limbs r0 (low 44 bits), r1 (next 44
#     bits) and r2 (the remainder, high qword >> 24), stored at
#     40/48/56($ctx);
#   - stores s1 = r1*5<<2 and s2 = r2*5<<2 at 24/32($ctx);
#   - writes -1 to 64($ctx); the block routines test the sign of that
#     qword to decide whether key powers still have to be computed.
# The .Linit_base2_44 label skips the hash zeroing.
$code.=<<___;
2857
.type poly1305_init_base2_44,\@function,3
2858
.align 32
2859
poly1305_init_base2_44:
2860
xor %eax,%eax
2861
mov %rax,0($ctx) # initialize hash value
2862
mov %rax,8($ctx)
2863
mov %rax,16($ctx)
2864
2865
.Linit_base2_44:
2866
lea poly1305_blocks_vpmadd52(%rip),%r10
2867
lea poly1305_emit_base2_44(%rip),%r11
2868
2869
mov \$0x0ffffffc0fffffff,%rax
2870
mov \$0x0ffffffc0ffffffc,%rcx
2871
and 0($inp),%rax
2872
mov \$0x00000fffffffffff,%r8
2873
and 8($inp),%rcx
2874
mov \$0x00000fffffffffff,%r9
2875
and %rax,%r8
2876
shrd \$44,%rcx,%rax
2877
mov %r8,40($ctx) # r0
2878
and %r9,%rax
2879
shr \$24,%rcx
2880
mov %rax,48($ctx) # r1
2881
lea (%rax,%rax,4),%rax # *5
2882
mov %rcx,56($ctx) # r2
2883
shl \$2,%rax # magic <<2
2884
lea (%rcx,%rcx,4),%rcx # *5
2885
shl \$2,%rcx # magic <<2
2886
mov %rax,24($ctx) # s1
2887
mov %rcx,32($ctx) # s2
2888
movq \$-1,64($ctx) # write impossible value
2889
___
2890
# Store the addresses of the blocks/emit routines (held in %r10/%r11)
# through %rdx: as two 64-bit values at offsets 0 and 8 normally, or as
# two 32-bit values at offsets 0 and 4 when the flavour matches elf32.
$code.=<<___ if ($flavour !~ /elf32/);
2891
mov %r10,0(%rdx)
2892
mov %r11,8(%rdx)
2893
___
2894
$code.=<<___ if ($flavour =~ /elf32/);
2895
mov %r10d,0(%rdx)
2896
mov %r11d,4(%rdx)
2897
___
2898
# Return 1 in %eax and close the function.
$code.=<<___;
2899
mov \$1,%eax
2900
RET
2901
.size poly1305_init_base2_44,.-poly1305_init_base2_44
2902
___
2903
{
# Emit poly1305_blocks_vpmadd52, the one-block-per-iteration VPMADD52
# routine. What the assembly below shows:
#   - $len is converted from bytes to 16-byte blocks (shr 4); zero blocks
#     returns immediately.
#   - A mask is chosen: 3 only when fewer than 4 blocks remain and the
#     qword at 64($ctx) is negative (the "impossible value" marker),
#     otherwise 1. %rax = mask & block count is the number of blocks run
#     through .Loop_vpmadd52; if it is zero, or if blocks remain after
#     the loop, control transfers to .Lblocks_vpmadd52_4x.
#   - %k7 (value 7) selects the three low qwords for hash/key loads and
#     the final hash store; %k1 (value 1) selects only the lowest qword.
#
# Register allocation (all %ymm): H0-H2 hold the hash limbs broadcast
# per multiplication, r2r1r0/r1r0s2/r0s2s1 hold the key vectors loaded
# from 40/32/24($ctx), Dlo/Dhi accumulate the low/high product halves.
2904
my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2905
my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2906
# NOTE(review): (22..25) yields four register names for three variables;
# the surplus "%ymm25" is silently discarded by the list assignment.
my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2907
2908
$code.=<<___;
2909
.type poly1305_blocks_vpmadd52,\@function,4
2910
.align 32
2911
poly1305_blocks_vpmadd52:
2912
shr \$4,$len
2913
jz .Lno_data_vpmadd52 # too short
2914
2915
shl \$40,$padbit
2916
mov 64($ctx),%r8 # peek on power of the key
2917
2918
# if powers of the key are not calculated yet, process up to 3
2919
# blocks with this single-block subroutine, otherwise ensure that
2920
# length is divisible by 2 blocks and pass the rest down to next
2921
# subroutine...
2922
2923
mov \$3,%rax
2924
mov \$1,%r10
2925
cmp \$4,$len # is input long
2926
cmovae %r10,%rax
2927
test %r8,%r8 # is power value impossible?
2928
cmovns %r10,%rax
2929
2930
and $len,%rax # is input of favourable length?
2931
jz .Lblocks_vpmadd52_4x
2932
2933
sub %rax,$len
2934
mov \$7,%r10d
2935
mov \$1,%r11d
2936
kmovw %r10d,%k7
2937
lea .L2_44_inp_permd(%rip),%r10
2938
kmovw %r11d,%k1
2939
2940
vmovq $padbit,%x#$PAD
2941
vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2942
vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2943
vpermq \$0xcf,$PAD,$PAD
2944
vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2945
2946
vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2947
vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2948
vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2949
vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2950
2951
vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2952
vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2953
2954
jmp .Loop_vpmadd52
2955
2956
.align 32
2957
.Loop_vpmadd52:
2958
vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2959
lea 16($inp),$inp
2960
2961
vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2962
vpsrlvq $inp_shift,$T0,$T0
2963
vpandq $reduc_mask,$T0,$T0
2964
vporq $PAD,$T0,$T0
2965
2966
vpaddq $T0,$Dlo,$Dlo # accumulate input
2967
2968
vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2969
vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2970
vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2971
2972
vpxord $Dlo,$Dlo,$Dlo
2973
vpxord $Dhi,$Dhi,$Dhi
2974
2975
vpmadd52luq $r2r1r0,$H0,$Dlo
2976
vpmadd52huq $r2r1r0,$H0,$Dhi
2977
2978
vpmadd52luq $r1r0s2,$H1,$Dlo
2979
vpmadd52huq $r1r0s2,$H1,$Dhi
2980
2981
vpmadd52luq $r0s2s1,$H2,$Dlo
2982
vpmadd52huq $r0s2s1,$H2,$Dhi
2983
2984
vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2985
vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2986
vpandq $reduc_mask,$Dlo,$Dlo
2987
2988
vpaddq $T0,$Dhi,$Dhi
2989
2990
vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2991
2992
vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2993
2994
vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2995
vpandq $reduc_mask,$Dlo,$Dlo
2996
2997
vpermq \$0b10010011,$T0,$T0
2998
2999
vpaddq $T0,$Dlo,$Dlo
3000
3001
vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
3002
3003
vpaddq $T0,$Dlo,$Dlo
3004
vpsllq \$2,$T0,$T0
3005
3006
vpaddq $T0,$Dlo,$Dlo
3007
3008
dec %rax # len-=16
3009
jnz .Loop_vpmadd52
3010
3011
vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
3012
3013
test $len,$len
3014
jnz .Lblocks_vpmadd52_4x
3015
3016
.Lno_data_vpmadd52:
3017
RET
3018
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3019
___
3020
}
3021
{
3022
########################################################################
3023
# As implied by its name 4x subroutine processes 4 blocks in parallel
3024
# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
3025
# and is handled in 256-bit %ymm registers.
3026
3027
# Register allocation (all %ymm): H0-H2 = hash limbs, R0-R2 = key-power
# limbs, S1/S2 = pre-scaled key limbs (R1*5*4 and R2*5*4, per the inline
# notes), DNlo/DNhi = low/high halves of the vpmadd52 products,
# T0-T3 = input temporaries, mask44/mask42 = masks applied after the
# 44- and 42-bit carry shifts, tmp = carry scratch, PAD = broadcast pad
# bit.
#
# Control flow of the emitted routine, as visible in the code below:
#   - $len is converted from bytes to 16-byte blocks (shr 4); zero
#     blocks returns immediately.
#   - .Lblocks_vpmadd52_4x is also the label poly1305_blocks_vpmadd52
#     jumps to when it hands work over.
#   - A negative qword at 64($ctx) routes through .Linit_vpmadd52, which
#     runs the multiply/reduce body twice (%eax counts down from 2) and
#     then stores the key powers at 64/96/128/160($ctx).
#   - Block counts that are not a multiple of 4 take the 2x path first
#     and then loop back to .Lblocks_vpmadd52_4x_do; counts that are a
#     multiple of 8 jump to .Lblocks_vpmadd52_8x, which is emitted by the
#     following block.
#   - Each ud2 directly follows an unconditional jmp, so it is not
#     reached in normal flow (presumably a speculation stop -- confirm).
#   - On exit the three hash limbs are written to 0/8/16($ctx) and
#     vzeroall clears the vector registers.
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3028
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3029
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3030
3031
$code.=<<___;
3032
.type poly1305_blocks_vpmadd52_4x,\@function,4
3033
.align 32
3034
poly1305_blocks_vpmadd52_4x:
3035
shr \$4,$len
3036
jz .Lno_data_vpmadd52_4x # too short
3037
3038
shl \$40,$padbit
3039
mov 64($ctx),%r8 # peek on power of the key
3040
3041
.Lblocks_vpmadd52_4x:
3042
vpbroadcastq $padbit,$PAD
3043
3044
vmovdqa64 .Lx_mask44(%rip),$mask44
3045
mov \$5,%eax
3046
vmovdqa64 .Lx_mask42(%rip),$mask42
3047
kmovw %eax,%k1 # used in 2x path
3048
3049
test %r8,%r8 # is power value impossible?
3050
js .Linit_vpmadd52 # if it is, then init R[4]
3051
3052
vmovq 0($ctx),%x#$H0 # load current hash value
3053
vmovq 8($ctx),%x#$H1
3054
vmovq 16($ctx),%x#$H2
3055
3056
test \$3,$len # is length 4*n+2?
3057
jnz .Lblocks_vpmadd52_2x_do
3058
3059
.Lblocks_vpmadd52_4x_do:
3060
vpbroadcastq 64($ctx),$R0 # load 4th power of the key
3061
vpbroadcastq 96($ctx),$R1
3062
vpbroadcastq 128($ctx),$R2
3063
vpbroadcastq 160($ctx),$S1
3064
3065
.Lblocks_vpmadd52_4x_key_loaded:
3066
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3067
vpaddq $R2,$S2,$S2
3068
vpsllq \$2,$S2,$S2
3069
3070
test \$7,$len # is len 8*n?
3071
jz .Lblocks_vpmadd52_8x
3072
3073
vmovdqu64 16*0($inp),$T2 # load data
3074
vmovdqu64 16*2($inp),$T3
3075
lea 16*4($inp),$inp
3076
3077
vpunpcklqdq $T3,$T2,$T1 # transpose data
3078
vpunpckhqdq $T3,$T2,$T3
3079
3080
# at this point 64-bit lanes are ordered as 3-1-2-0
3081
3082
vpsrlq \$24,$T3,$T2 # splat the data
3083
vporq $PAD,$T2,$T2
3084
vpaddq $T2,$H2,$H2 # accumulate input
3085
vpandq $mask44,$T1,$T0
3086
vpsrlq \$44,$T1,$T1
3087
vpsllq \$20,$T3,$T3
3088
vporq $T3,$T1,$T1
3089
vpandq $mask44,$T1,$T1
3090
3091
sub \$4,$len
3092
jz .Ltail_vpmadd52_4x
3093
jmp .Loop_vpmadd52_4x
3094
ud2
3095
3096
.align 32
3097
.Linit_vpmadd52:
3098
vmovq 24($ctx),%x#$S1 # load key
3099
vmovq 56($ctx),%x#$H2
3100
vmovq 32($ctx),%x#$S2
3101
vmovq 40($ctx),%x#$R0
3102
vmovq 48($ctx),%x#$R1
3103
3104
vmovdqa $R0,$H0
3105
vmovdqa $R1,$H1
3106
vmovdqa $H2,$R2
3107
3108
mov \$2,%eax
3109
3110
.Lmul_init_vpmadd52:
3111
vpxorq $D0lo,$D0lo,$D0lo
3112
vpmadd52luq $H2,$S1,$D0lo
3113
vpxorq $D0hi,$D0hi,$D0hi
3114
vpmadd52huq $H2,$S1,$D0hi
3115
vpxorq $D1lo,$D1lo,$D1lo
3116
vpmadd52luq $H2,$S2,$D1lo
3117
vpxorq $D1hi,$D1hi,$D1hi
3118
vpmadd52huq $H2,$S2,$D1hi
3119
vpxorq $D2lo,$D2lo,$D2lo
3120
vpmadd52luq $H2,$R0,$D2lo
3121
vpxorq $D2hi,$D2hi,$D2hi
3122
vpmadd52huq $H2,$R0,$D2hi
3123
3124
vpmadd52luq $H0,$R0,$D0lo
3125
vpmadd52huq $H0,$R0,$D0hi
3126
vpmadd52luq $H0,$R1,$D1lo
3127
vpmadd52huq $H0,$R1,$D1hi
3128
vpmadd52luq $H0,$R2,$D2lo
3129
vpmadd52huq $H0,$R2,$D2hi
3130
3131
vpmadd52luq $H1,$S2,$D0lo
3132
vpmadd52huq $H1,$S2,$D0hi
3133
vpmadd52luq $H1,$R0,$D1lo
3134
vpmadd52huq $H1,$R0,$D1hi
3135
vpmadd52luq $H1,$R1,$D2lo
3136
vpmadd52huq $H1,$R1,$D2hi
3137
3138
################################################################
3139
# partial reduction
3140
vpsrlq \$44,$D0lo,$tmp
3141
vpsllq \$8,$D0hi,$D0hi
3142
vpandq $mask44,$D0lo,$H0
3143
vpaddq $tmp,$D0hi,$D0hi
3144
3145
vpaddq $D0hi,$D1lo,$D1lo
3146
3147
vpsrlq \$44,$D1lo,$tmp
3148
vpsllq \$8,$D1hi,$D1hi
3149
vpandq $mask44,$D1lo,$H1
3150
vpaddq $tmp,$D1hi,$D1hi
3151
3152
vpaddq $D1hi,$D2lo,$D2lo
3153
3154
vpsrlq \$42,$D2lo,$tmp
3155
vpsllq \$10,$D2hi,$D2hi
3156
vpandq $mask42,$D2lo,$H2
3157
vpaddq $tmp,$D2hi,$D2hi
3158
3159
vpaddq $D2hi,$H0,$H0
3160
vpsllq \$2,$D2hi,$D2hi
3161
3162
vpaddq $D2hi,$H0,$H0
3163
3164
vpsrlq \$44,$H0,$tmp # additional step
3165
vpandq $mask44,$H0,$H0
3166
3167
vpaddq $tmp,$H1,$H1
3168
3169
dec %eax
3170
jz .Ldone_init_vpmadd52
3171
3172
vpunpcklqdq $R1,$H1,$R1 # 1,2
3173
vpbroadcastq %x#$H1,%x#$H1 # 2,2
3174
vpunpcklqdq $R2,$H2,$R2
3175
vpbroadcastq %x#$H2,%x#$H2
3176
vpunpcklqdq $R0,$H0,$R0
3177
vpbroadcastq %x#$H0,%x#$H0
3178
3179
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3180
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3181
vpaddq $R1,$S1,$S1
3182
vpaddq $R2,$S2,$S2
3183
vpsllq \$2,$S1,$S1
3184
vpsllq \$2,$S2,$S2
3185
3186
jmp .Lmul_init_vpmadd52
3187
ud2
3188
3189
.align 32
3190
.Ldone_init_vpmadd52:
3191
vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3192
vinserti128 \$1,%x#$R2,$H2,$R2
3193
vinserti128 \$1,%x#$R0,$H0,$R0
3194
3195
vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3196
vpermq \$0b11011000,$R2,$R2
3197
vpermq \$0b11011000,$R0,$R0
3198
3199
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3200
vpaddq $R1,$S1,$S1
3201
vpsllq \$2,$S1,$S1
3202
3203
vmovq 0($ctx),%x#$H0 # load current hash value
3204
vmovq 8($ctx),%x#$H1
3205
vmovq 16($ctx),%x#$H2
3206
3207
test \$3,$len # is length 4*n+2?
3208
jnz .Ldone_init_vpmadd52_2x
3209
3210
vmovdqu64 $R0,64($ctx) # save key powers
3211
vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3212
vmovdqu64 $R1,96($ctx)
3213
vpbroadcastq %x#$R1,$R1
3214
vmovdqu64 $R2,128($ctx)
3215
vpbroadcastq %x#$R2,$R2
3216
vmovdqu64 $S1,160($ctx)
3217
vpbroadcastq %x#$S1,$S1
3218
3219
jmp .Lblocks_vpmadd52_4x_key_loaded
3220
ud2
3221
3222
.align 32
3223
.Ldone_init_vpmadd52_2x:
3224
vmovdqu64 $R0,64($ctx) # save key powers
3225
vpsrldq \$8,$R0,$R0 # 0-1-0-2
3226
vmovdqu64 $R1,96($ctx)
3227
vpsrldq \$8,$R1,$R1
3228
vmovdqu64 $R2,128($ctx)
3229
vpsrldq \$8,$R2,$R2
3230
vmovdqu64 $S1,160($ctx)
3231
vpsrldq \$8,$S1,$S1
3232
jmp .Lblocks_vpmadd52_2x_key_loaded
3233
ud2
3234
3235
.align 32
3236
.Lblocks_vpmadd52_2x_do:
3237
vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3238
vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3239
vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3240
vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3241
3242
.Lblocks_vpmadd52_2x_key_loaded:
3243
vmovdqu64 16*0($inp),$T2 # load data
3244
vpxorq $T3,$T3,$T3
3245
lea 16*2($inp),$inp
3246
3247
vpunpcklqdq $T3,$T2,$T1 # transpose data
3248
vpunpckhqdq $T3,$T2,$T3
3249
3250
# at this point 64-bit lanes are ordered as x-1-x-0
3251
3252
vpsrlq \$24,$T3,$T2 # splat the data
3253
vporq $PAD,$T2,$T2
3254
vpaddq $T2,$H2,$H2 # accumulate input
3255
vpandq $mask44,$T1,$T0
3256
vpsrlq \$44,$T1,$T1
3257
vpsllq \$20,$T3,$T3
3258
vporq $T3,$T1,$T1
3259
vpandq $mask44,$T1,$T1
3260
3261
jmp .Ltail_vpmadd52_2x
3262
ud2
3263
3264
.align 32
3265
.Loop_vpmadd52_4x:
3266
#vpaddq $T2,$H2,$H2 # accumulate input
3267
vpaddq $T0,$H0,$H0
3268
vpaddq $T1,$H1,$H1
3269
3270
vpxorq $D0lo,$D0lo,$D0lo
3271
vpmadd52luq $H2,$S1,$D0lo
3272
vpxorq $D0hi,$D0hi,$D0hi
3273
vpmadd52huq $H2,$S1,$D0hi
3274
vpxorq $D1lo,$D1lo,$D1lo
3275
vpmadd52luq $H2,$S2,$D1lo
3276
vpxorq $D1hi,$D1hi,$D1hi
3277
vpmadd52huq $H2,$S2,$D1hi
3278
vpxorq $D2lo,$D2lo,$D2lo
3279
vpmadd52luq $H2,$R0,$D2lo
3280
vpxorq $D2hi,$D2hi,$D2hi
3281
vpmadd52huq $H2,$R0,$D2hi
3282
3283
vmovdqu64 16*0($inp),$T2 # load data
3284
vmovdqu64 16*2($inp),$T3
3285
lea 16*4($inp),$inp
3286
vpmadd52luq $H0,$R0,$D0lo
3287
vpmadd52huq $H0,$R0,$D0hi
3288
vpmadd52luq $H0,$R1,$D1lo
3289
vpmadd52huq $H0,$R1,$D1hi
3290
vpmadd52luq $H0,$R2,$D2lo
3291
vpmadd52huq $H0,$R2,$D2hi
3292
3293
vpunpcklqdq $T3,$T2,$T1 # transpose data
3294
vpunpckhqdq $T3,$T2,$T3
3295
vpmadd52luq $H1,$S2,$D0lo
3296
vpmadd52huq $H1,$S2,$D0hi
3297
vpmadd52luq $H1,$R0,$D1lo
3298
vpmadd52huq $H1,$R0,$D1hi
3299
vpmadd52luq $H1,$R1,$D2lo
3300
vpmadd52huq $H1,$R1,$D2hi
3301
3302
################################################################
3303
# partial reduction (interleaved with data splat)
3304
vpsrlq \$44,$D0lo,$tmp
3305
vpsllq \$8,$D0hi,$D0hi
3306
vpandq $mask44,$D0lo,$H0
3307
vpaddq $tmp,$D0hi,$D0hi
3308
3309
vpsrlq \$24,$T3,$T2
3310
vporq $PAD,$T2,$T2
3311
vpaddq $D0hi,$D1lo,$D1lo
3312
3313
vpsrlq \$44,$D1lo,$tmp
3314
vpsllq \$8,$D1hi,$D1hi
3315
vpandq $mask44,$D1lo,$H1
3316
vpaddq $tmp,$D1hi,$D1hi
3317
3318
vpandq $mask44,$T1,$T0
3319
vpsrlq \$44,$T1,$T1
3320
vpsllq \$20,$T3,$T3
3321
vpaddq $D1hi,$D2lo,$D2lo
3322
3323
vpsrlq \$42,$D2lo,$tmp
3324
vpsllq \$10,$D2hi,$D2hi
3325
vpandq $mask42,$D2lo,$H2
3326
vpaddq $tmp,$D2hi,$D2hi
3327
3328
vpaddq $T2,$H2,$H2 # accumulate input
3329
vpaddq $D2hi,$H0,$H0
3330
vpsllq \$2,$D2hi,$D2hi
3331
3332
vpaddq $D2hi,$H0,$H0
3333
vporq $T3,$T1,$T1
3334
vpandq $mask44,$T1,$T1
3335
3336
vpsrlq \$44,$H0,$tmp # additional step
3337
vpandq $mask44,$H0,$H0
3338
3339
vpaddq $tmp,$H1,$H1
3340
3341
sub \$4,$len # len-=64
3342
jnz .Loop_vpmadd52_4x
3343
3344
.Ltail_vpmadd52_4x:
3345
vmovdqu64 128($ctx),$R2 # load all key powers
3346
vmovdqu64 160($ctx),$S1
3347
vmovdqu64 64($ctx),$R0
3348
vmovdqu64 96($ctx),$R1
3349
3350
.Ltail_vpmadd52_2x:
3351
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3352
vpaddq $R2,$S2,$S2
3353
vpsllq \$2,$S2,$S2
3354
3355
#vpaddq $T2,$H2,$H2 # accumulate input
3356
vpaddq $T0,$H0,$H0
3357
vpaddq $T1,$H1,$H1
3358
3359
vpxorq $D0lo,$D0lo,$D0lo
3360
vpmadd52luq $H2,$S1,$D0lo
3361
vpxorq $D0hi,$D0hi,$D0hi
3362
vpmadd52huq $H2,$S1,$D0hi
3363
vpxorq $D1lo,$D1lo,$D1lo
3364
vpmadd52luq $H2,$S2,$D1lo
3365
vpxorq $D1hi,$D1hi,$D1hi
3366
vpmadd52huq $H2,$S2,$D1hi
3367
vpxorq $D2lo,$D2lo,$D2lo
3368
vpmadd52luq $H2,$R0,$D2lo
3369
vpxorq $D2hi,$D2hi,$D2hi
3370
vpmadd52huq $H2,$R0,$D2hi
3371
3372
vpmadd52luq $H0,$R0,$D0lo
3373
vpmadd52huq $H0,$R0,$D0hi
3374
vpmadd52luq $H0,$R1,$D1lo
3375
vpmadd52huq $H0,$R1,$D1hi
3376
vpmadd52luq $H0,$R2,$D2lo
3377
vpmadd52huq $H0,$R2,$D2hi
3378
3379
vpmadd52luq $H1,$S2,$D0lo
3380
vpmadd52huq $H1,$S2,$D0hi
3381
vpmadd52luq $H1,$R0,$D1lo
3382
vpmadd52huq $H1,$R0,$D1hi
3383
vpmadd52luq $H1,$R1,$D2lo
3384
vpmadd52huq $H1,$R1,$D2hi
3385
3386
################################################################
3387
# horizontal addition
3388
3389
mov \$1,%eax
3390
kmovw %eax,%k1
3391
vpsrldq \$8,$D0lo,$T0
3392
vpsrldq \$8,$D0hi,$H0
3393
vpsrldq \$8,$D1lo,$T1
3394
vpsrldq \$8,$D1hi,$H1
3395
vpaddq $T0,$D0lo,$D0lo
3396
vpaddq $H0,$D0hi,$D0hi
3397
vpsrldq \$8,$D2lo,$T2
3398
vpsrldq \$8,$D2hi,$H2
3399
vpaddq $T1,$D1lo,$D1lo
3400
vpaddq $H1,$D1hi,$D1hi
3401
vpermq \$0x2,$D0lo,$T0
3402
vpermq \$0x2,$D0hi,$H0
3403
vpaddq $T2,$D2lo,$D2lo
3404
vpaddq $H2,$D2hi,$D2hi
3405
3406
vpermq \$0x2,$D1lo,$T1
3407
vpermq \$0x2,$D1hi,$H1
3408
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3409
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3410
vpermq \$0x2,$D2lo,$T2
3411
vpermq \$0x2,$D2hi,$H2
3412
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3413
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3414
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3415
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3416
3417
################################################################
3418
# partial reduction
3419
vpsrlq \$44,$D0lo,$tmp
3420
vpsllq \$8,$D0hi,$D0hi
3421
vpandq $mask44,$D0lo,$H0
3422
vpaddq $tmp,$D0hi,$D0hi
3423
3424
vpaddq $D0hi,$D1lo,$D1lo
3425
3426
vpsrlq \$44,$D1lo,$tmp
3427
vpsllq \$8,$D1hi,$D1hi
3428
vpandq $mask44,$D1lo,$H1
3429
vpaddq $tmp,$D1hi,$D1hi
3430
3431
vpaddq $D1hi,$D2lo,$D2lo
3432
3433
vpsrlq \$42,$D2lo,$tmp
3434
vpsllq \$10,$D2hi,$D2hi
3435
vpandq $mask42,$D2lo,$H2
3436
vpaddq $tmp,$D2hi,$D2hi
3437
3438
vpaddq $D2hi,$H0,$H0
3439
vpsllq \$2,$D2hi,$D2hi
3440
3441
vpaddq $D2hi,$H0,$H0
3442
3443
vpsrlq \$44,$H0,$tmp # additional step
3444
vpandq $mask44,$H0,$H0
3445
3446
vpaddq $tmp,$H1,$H1
3447
# at this point $len is
3448
# either 4*n+2 or 0...
3449
sub \$2,$len # len-=32
3450
ja .Lblocks_vpmadd52_4x_do
3451
3452
vmovq %x#$H0,0($ctx)
3453
vmovq %x#$H1,8($ctx)
3454
vmovq %x#$H2,16($ctx)
3455
vzeroall
3456
3457
.Lno_data_vpmadd52_4x:
3458
RET
3459
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3460
___
3461
}
3462
{
3463
########################################################################
3464
# As implied by its name 8x subroutine processes 8 blocks in parallel...
3465
# This is intermediate version, as it's used only in cases when input
3466
# length is either 8*n, 8*n+1 or 8*n+2...
3467
3468
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3469
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3470
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3471
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3472
3473
$code.=<<___;
3474
.type poly1305_blocks_vpmadd52_8x,\@function,4
3475
.align 32
3476
poly1305_blocks_vpmadd52_8x:
3477
shr \$4,$len
3478
jz .Lno_data_vpmadd52_8x # too short
3479
3480
shl \$40,$padbit
3481
mov 64($ctx),%r8 # peek on power of the key
3482
3483
vmovdqa64 .Lx_mask44(%rip),$mask44
3484
vmovdqa64 .Lx_mask42(%rip),$mask42
3485
3486
test %r8,%r8 # is power value impossible?
3487
js .Linit_vpmadd52 # if it is, then init R[4]
3488
3489
vmovq 0($ctx),%x#$H0 # load current hash value
3490
vmovq 8($ctx),%x#$H1
3491
vmovq 16($ctx),%x#$H2
3492
3493
.Lblocks_vpmadd52_8x:
3494
################################################################
3495
# first we calculate more key powers
3496
3497
vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3498
vmovdqu64 160($ctx),$S1
3499
vmovdqu64 64($ctx),$R0
3500
vmovdqu64 96($ctx),$R1
3501
3502
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3503
vpaddq $R2,$S2,$S2
3504
vpsllq \$2,$S2,$S2
3505
3506
vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3507
vpbroadcastq %x#$R0,$RR0
3508
vpbroadcastq %x#$R1,$RR1
3509
3510
vpxorq $D0lo,$D0lo,$D0lo
3511
vpmadd52luq $RR2,$S1,$D0lo
3512
vpxorq $D0hi,$D0hi,$D0hi
3513
vpmadd52huq $RR2,$S1,$D0hi
3514
vpxorq $D1lo,$D1lo,$D1lo
3515
vpmadd52luq $RR2,$S2,$D1lo
3516
vpxorq $D1hi,$D1hi,$D1hi
3517
vpmadd52huq $RR2,$S2,$D1hi
3518
vpxorq $D2lo,$D2lo,$D2lo
3519
vpmadd52luq $RR2,$R0,$D2lo
3520
vpxorq $D2hi,$D2hi,$D2hi
3521
vpmadd52huq $RR2,$R0,$D2hi
3522
3523
vpmadd52luq $RR0,$R0,$D0lo
3524
vpmadd52huq $RR0,$R0,$D0hi
3525
vpmadd52luq $RR0,$R1,$D1lo
3526
vpmadd52huq $RR0,$R1,$D1hi
3527
vpmadd52luq $RR0,$R2,$D2lo
3528
vpmadd52huq $RR0,$R2,$D2hi
3529
3530
vpmadd52luq $RR1,$S2,$D0lo
3531
vpmadd52huq $RR1,$S2,$D0hi
3532
vpmadd52luq $RR1,$R0,$D1lo
3533
vpmadd52huq $RR1,$R0,$D1hi
3534
vpmadd52luq $RR1,$R1,$D2lo
3535
vpmadd52huq $RR1,$R1,$D2hi
3536
3537
################################################################
3538
# partial reduction
3539
vpsrlq \$44,$D0lo,$tmp
3540
vpsllq \$8,$D0hi,$D0hi
3541
vpandq $mask44,$D0lo,$RR0
3542
vpaddq $tmp,$D0hi,$D0hi
3543
3544
vpaddq $D0hi,$D1lo,$D1lo
3545
3546
vpsrlq \$44,$D1lo,$tmp
3547
vpsllq \$8,$D1hi,$D1hi
3548
vpandq $mask44,$D1lo,$RR1
3549
vpaddq $tmp,$D1hi,$D1hi
3550
3551
vpaddq $D1hi,$D2lo,$D2lo
3552
3553
vpsrlq \$42,$D2lo,$tmp
3554
vpsllq \$10,$D2hi,$D2hi
3555
vpandq $mask42,$D2lo,$RR2
3556
vpaddq $tmp,$D2hi,$D2hi
3557
3558
vpaddq $D2hi,$RR0,$RR0
3559
vpsllq \$2,$D2hi,$D2hi
3560
3561
vpaddq $D2hi,$RR0,$RR0
3562
3563
vpsrlq \$44,$RR0,$tmp # additional step
3564
vpandq $mask44,$RR0,$RR0
3565
3566
vpaddq $tmp,$RR1,$RR1
3567
3568
################################################################
3569
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
3570
# is 15263748, which reflects how data is loaded...
3571
3572
vpunpcklqdq $R2,$RR2,$T2 # 3748
3573
vpunpckhqdq $R2,$RR2,$R2 # 1526
3574
vpunpcklqdq $R0,$RR0,$T0
3575
vpunpckhqdq $R0,$RR0,$R0
3576
vpunpcklqdq $R1,$RR1,$T1
3577
vpunpckhqdq $R1,$RR1,$R1
3578
___
3579
######## switch to %zmm
3580
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3581
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3582
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3583
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3584
3585
$code.=<<___;
3586
vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3587
vshufi64x2 \$0x44,$R0,$T0,$RR0
3588
vshufi64x2 \$0x44,$R1,$T1,$RR1
3589
3590
vmovdqu64 16*0($inp),$T2 # load data
3591
vmovdqu64 16*4($inp),$T3
3592
lea 16*8($inp),$inp
3593
3594
vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3595
vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3596
vpaddq $RR2,$SS2,$SS2
3597
vpaddq $RR1,$SS1,$SS1
3598
vpsllq \$2,$SS2,$SS2
3599
vpsllq \$2,$SS1,$SS1
3600
3601
vpbroadcastq $padbit,$PAD
3602
vpbroadcastq %x#$mask44,$mask44
3603
vpbroadcastq %x#$mask42,$mask42
3604
3605
vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3606
vpbroadcastq %x#$SS2,$S2
3607
vpbroadcastq %x#$RR0,$R0
3608
vpbroadcastq %x#$RR1,$R1
3609
vpbroadcastq %x#$RR2,$R2
3610
3611
vpunpcklqdq $T3,$T2,$T1 # transpose data
3612
vpunpckhqdq $T3,$T2,$T3
3613
3614
# at this point 64-bit lanes are ordered as 73625140
3615
3616
vpsrlq \$24,$T3,$T2 # splat the data
3617
vporq $PAD,$T2,$T2
3618
vpaddq $T2,$H2,$H2 # accumulate input
3619
vpandq $mask44,$T1,$T0
3620
vpsrlq \$44,$T1,$T1
3621
vpsllq \$20,$T3,$T3
3622
vporq $T3,$T1,$T1
3623
vpandq $mask44,$T1,$T1
3624
3625
sub \$8,$len
3626
jz .Ltail_vpmadd52_8x
3627
jmp .Loop_vpmadd52_8x
3628
3629
.align 32
3630
.Loop_vpmadd52_8x:
3631
#vpaddq $T2,$H2,$H2 # accumulate input
3632
vpaddq $T0,$H0,$H0
3633
vpaddq $T1,$H1,$H1
3634
3635
vpxorq $D0lo,$D0lo,$D0lo
3636
vpmadd52luq $H2,$S1,$D0lo
3637
vpxorq $D0hi,$D0hi,$D0hi
3638
vpmadd52huq $H2,$S1,$D0hi
3639
vpxorq $D1lo,$D1lo,$D1lo
3640
vpmadd52luq $H2,$S2,$D1lo
3641
vpxorq $D1hi,$D1hi,$D1hi
3642
vpmadd52huq $H2,$S2,$D1hi
3643
vpxorq $D2lo,$D2lo,$D2lo
3644
vpmadd52luq $H2,$R0,$D2lo
3645
vpxorq $D2hi,$D2hi,$D2hi
3646
vpmadd52huq $H2,$R0,$D2hi
3647
3648
vmovdqu64 16*0($inp),$T2 # load data
3649
vmovdqu64 16*4($inp),$T3
3650
lea 16*8($inp),$inp
3651
vpmadd52luq $H0,$R0,$D0lo
3652
vpmadd52huq $H0,$R0,$D0hi
3653
vpmadd52luq $H0,$R1,$D1lo
3654
vpmadd52huq $H0,$R1,$D1hi
3655
vpmadd52luq $H0,$R2,$D2lo
3656
vpmadd52huq $H0,$R2,$D2hi
3657
3658
vpunpcklqdq $T3,$T2,$T1 # transpose data
3659
vpunpckhqdq $T3,$T2,$T3
3660
vpmadd52luq $H1,$S2,$D0lo
3661
vpmadd52huq $H1,$S2,$D0hi
3662
vpmadd52luq $H1,$R0,$D1lo
3663
vpmadd52huq $H1,$R0,$D1hi
3664
vpmadd52luq $H1,$R1,$D2lo
3665
vpmadd52huq $H1,$R1,$D2hi
3666
3667
################################################################
3668
# partial reduction (interleaved with data splat)
3669
vpsrlq \$44,$D0lo,$tmp
3670
vpsllq \$8,$D0hi,$D0hi
3671
vpandq $mask44,$D0lo,$H0
3672
vpaddq $tmp,$D0hi,$D0hi
3673
3674
vpsrlq \$24,$T3,$T2
3675
vporq $PAD,$T2,$T2
3676
vpaddq $D0hi,$D1lo,$D1lo
3677
3678
vpsrlq \$44,$D1lo,$tmp
3679
vpsllq \$8,$D1hi,$D1hi
3680
vpandq $mask44,$D1lo,$H1
3681
vpaddq $tmp,$D1hi,$D1hi
3682
3683
vpandq $mask44,$T1,$T0
3684
vpsrlq \$44,$T1,$T1
3685
vpsllq \$20,$T3,$T3
3686
vpaddq $D1hi,$D2lo,$D2lo
3687
3688
vpsrlq \$42,$D2lo,$tmp
3689
vpsllq \$10,$D2hi,$D2hi
3690
vpandq $mask42,$D2lo,$H2
3691
vpaddq $tmp,$D2hi,$D2hi
3692
3693
vpaddq $T2,$H2,$H2 # accumulate input
3694
vpaddq $D2hi,$H0,$H0
3695
vpsllq \$2,$D2hi,$D2hi
3696
3697
vpaddq $D2hi,$H0,$H0
3698
vporq $T3,$T1,$T1
3699
vpandq $mask44,$T1,$T1
3700
3701
vpsrlq \$44,$H0,$tmp # additional step
3702
vpandq $mask44,$H0,$H0
3703
3704
vpaddq $tmp,$H1,$H1
3705
3706
sub \$8,$len # len-=128
3707
jnz .Loop_vpmadd52_8x
3708
3709
.Ltail_vpmadd52_8x:
3710
#vpaddq $T2,$H2,$H2 # accumulate input
3711
vpaddq $T0,$H0,$H0
3712
vpaddq $T1,$H1,$H1
3713
3714
vpxorq $D0lo,$D0lo,$D0lo
3715
vpmadd52luq $H2,$SS1,$D0lo
3716
vpxorq $D0hi,$D0hi,$D0hi
3717
vpmadd52huq $H2,$SS1,$D0hi
3718
vpxorq $D1lo,$D1lo,$D1lo
3719
vpmadd52luq $H2,$SS2,$D1lo
3720
vpxorq $D1hi,$D1hi,$D1hi
3721
vpmadd52huq $H2,$SS2,$D1hi
3722
vpxorq $D2lo,$D2lo,$D2lo
3723
vpmadd52luq $H2,$RR0,$D2lo
3724
vpxorq $D2hi,$D2hi,$D2hi
3725
vpmadd52huq $H2,$RR0,$D2hi
3726
3727
vpmadd52luq $H0,$RR0,$D0lo
3728
vpmadd52huq $H0,$RR0,$D0hi
3729
vpmadd52luq $H0,$RR1,$D1lo
3730
vpmadd52huq $H0,$RR1,$D1hi
3731
vpmadd52luq $H0,$RR2,$D2lo
3732
vpmadd52huq $H0,$RR2,$D2hi
3733
3734
vpmadd52luq $H1,$SS2,$D0lo
3735
vpmadd52huq $H1,$SS2,$D0hi
3736
vpmadd52luq $H1,$RR0,$D1lo
3737
vpmadd52huq $H1,$RR0,$D1hi
3738
vpmadd52luq $H1,$RR1,$D2lo
3739
vpmadd52huq $H1,$RR1,$D2hi
3740
3741
################################################################
3742
# horizontal addition
3743
3744
mov \$1,%eax
3745
kmovw %eax,%k1
3746
vpsrldq \$8,$D0lo,$T0
3747
vpsrldq \$8,$D0hi,$H0
3748
vpsrldq \$8,$D1lo,$T1
3749
vpsrldq \$8,$D1hi,$H1
3750
vpaddq $T0,$D0lo,$D0lo
3751
vpaddq $H0,$D0hi,$D0hi
3752
vpsrldq \$8,$D2lo,$T2
3753
vpsrldq \$8,$D2hi,$H2
3754
vpaddq $T1,$D1lo,$D1lo
3755
vpaddq $H1,$D1hi,$D1hi
3756
vpermq \$0x2,$D0lo,$T0
3757
vpermq \$0x2,$D0hi,$H0
3758
vpaddq $T2,$D2lo,$D2lo
3759
vpaddq $H2,$D2hi,$D2hi
3760
3761
vpermq \$0x2,$D1lo,$T1
3762
vpermq \$0x2,$D1hi,$H1
3763
vpaddq $T0,$D0lo,$D0lo
3764
vpaddq $H0,$D0hi,$D0hi
3765
vpermq \$0x2,$D2lo,$T2
3766
vpermq \$0x2,$D2hi,$H2
3767
vpaddq $T1,$D1lo,$D1lo
3768
vpaddq $H1,$D1hi,$D1hi
3769
vextracti64x4 \$1,$D0lo,%y#$T0
3770
vextracti64x4 \$1,$D0hi,%y#$H0
3771
vpaddq $T2,$D2lo,$D2lo
3772
vpaddq $H2,$D2hi,$D2hi
3773
3774
vextracti64x4 \$1,$D1lo,%y#$T1
3775
vextracti64x4 \$1,$D1hi,%y#$H1
3776
vextracti64x4 \$1,$D2lo,%y#$T2
3777
vextracti64x4 \$1,$D2hi,%y#$H2
3778
___
3779
######## switch back to %ymm
3780
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3781
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3782
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3783
3784
$code.=<<___;
3785
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3786
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3787
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3788
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3789
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3790
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3791
3792
################################################################
3793
# partial reduction
3794
vpsrlq \$44,$D0lo,$tmp
3795
vpsllq \$8,$D0hi,$D0hi
3796
vpandq $mask44,$D0lo,$H0
3797
vpaddq $tmp,$D0hi,$D0hi
3798
3799
vpaddq $D0hi,$D1lo,$D1lo
3800
3801
vpsrlq \$44,$D1lo,$tmp
3802
vpsllq \$8,$D1hi,$D1hi
3803
vpandq $mask44,$D1lo,$H1
3804
vpaddq $tmp,$D1hi,$D1hi
3805
3806
vpaddq $D1hi,$D2lo,$D2lo
3807
3808
vpsrlq \$42,$D2lo,$tmp
3809
vpsllq \$10,$D2hi,$D2hi
3810
vpandq $mask42,$D2lo,$H2
3811
vpaddq $tmp,$D2hi,$D2hi
3812
3813
vpaddq $D2hi,$H0,$H0
3814
vpsllq \$2,$D2hi,$D2hi
3815
3816
vpaddq $D2hi,$H0,$H0
3817
3818
vpsrlq \$44,$H0,$tmp # additional step
3819
vpandq $mask44,$H0,$H0
3820
3821
vpaddq $tmp,$H1,$H1
3822
3823
################################################################
3824
3825
vmovq %x#$H0,0($ctx)
3826
vmovq %x#$H1,8($ctx)
3827
vmovq %x#$H2,16($ctx)
3828
vzeroall
3829
3830
.Lno_data_vpmadd52_8x:
3831
RET
3832
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3833
___
3834
}
3835
# poly1305_emit_base2_44
#
# Emits the final tag for the base 2^44 code path.  The routine takes three
# arguments (see the ",3" on the .type line); the registers behind $ctx,
# $mac and $nonce are chosen earlier in this file.
#
#   1. Load the three 64-bit limbs stored at 0/8/16($ctx).
#   2. Repack them into a 64+64+2 bit value: limb 1 is shifted in at bit 44
#      and limb 2 at bit 88 (the shl 44/shr 20 and shl 24/shr 40 pairs).
#   3. Add 5 and look at the bits above bit 130; if any are set the
#      "+5" copy is selected with cmovnz, i.e. the value is reduced
#      modulo 2^130-5.
#   4. Add the 128-bit nonce and store the low 128 bits at 0/8($mac).
#
# Only full-line Perl comments are used for this description so that
# nothing extra is written into the generated assembly.
$code.=<<___;
.type	poly1305_emit_base2_44,\@function,3
.align	32
poly1305_emit_base2_44:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r9,%rax
	shr	\$20,%r9
	shl	\$44,%rax
	mov	%r10,%rcx
	shr	\$40,%r10
	shl	\$24,%rcx

	add	%rax,%r8
	adc	%rcx,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
3871
} } }
3872
}
3873
3874
# chacha20-poly1305 helpers, generated only for non-kernel builds.
#
# Both routines take (out, inp, otp, len) in the ABI-specific registers
# selected below and return, in %rax, the otp pointer advanced past the
# processed (and zero-padded) data.
#
#   xor128_encrypt_n_pad: out[i] = inp[i] ^ otp[i], and the result is also
#                         written back over otp[i].
#   xor128_decrypt_n_pad: out[i] = inp[i] ^ otp[i], and inp[i] is written
#                         back over otp[i].
#
# After a trailing partial block, otp is filled with zero bytes up to the
# next multiple of 16.
#
# The 16-byte loops access otp with movdqa, so otp must be 16-byte aligned;
# inp and out are accessed with movdqu and may be unaligned.
#
# NOTE(review): with len == 0 both routines branch to the byte loop with a
# zero counter, which then wraps around; callers presumably never pass
# len == 0 -- confirm against the call sites.
if (!$kernel)
{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
$code.=<<___;
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,\@abi-omnipotent
.align	16
xor128_encrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	($inp,$otp),%xmm0
	pxor	($otp),%xmm0
	movdqu	%xmm0,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_enc_xmm

	and	\$15,%r10		# len % 16
	jz	.Ldone_enc

.Ltail_enc:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
.Loop_enc_byte:
	mov	($inp,$otp),%al
	xor	($otp),%al
	mov	%al,($out,$otp)
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_enc_byte

	xor	%eax,%eax
.Loop_enc_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_enc_pad

.Ldone_enc:
	mov	$otp,%rax
	RET
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,\@abi-omnipotent
.align	16
xor128_decrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	($inp,$otp),%xmm0
	movdqa	($otp),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	and	\$15,%r10		# len % 16
	jz	.Ldone_dec

.Ltail_dec:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
	xor	%r11d,%r11d
.Loop_dec_byte:
	mov	($inp,$otp),%r11b
	mov	($otp),%al
	xor	%r11b,%al
	mov	%al,($out,$otp)
	mov	%r11b,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_dec_byte

	xor	%eax,%eax
.Loop_dec_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_dec_pad

.Ldone_dec:
	mov	$otp,%rax
	RET
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
3978
3979
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3980
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3981
# Win64 structured exception handling support.
#
# Two language-specific handlers are generated, plus the .pdata/.xdata
# tables that attach them to the Poly1305 entry points:
#
#   se_handler   For code that only pushed general-purpose registers.
#                HandlerData[0] and HandlerData[1] are image-relative
#                addresses of the end of the prologue and the start of the
#                epilogue.  If the faulting Rip lies between them, the six
#                saved registers (%rbx, %rbp, %r12-%r15) are read back from
#                the 48 bytes above context->Rsp and written into CONTEXT.
#
#   avx_handler  For the SIMD code paths.  Between the two labels the frame
#                is located through context->R11, and 20 quadwords starting
#                at 0x50 from it are copied to the Xmm6 slot of CONTEXT.
#
# Both fall into .Lcommon_seh_tail, which restores Rsp/Rsi/Rdi in CONTEXT,
# copies CONTEXT to disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
#
# The handlers receive (rec, frame, context, disp) in the first four Win64
# argument registers; only $context and $disp are referenced below.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	RET
.size	avx_handler,.-avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_block_init_arch
	.rva	.LSEH_end_poly1305_block_init_arch
	.rva	.LSEH_info_poly1305_block_init_arch

	.rva	.LSEH_begin_poly1305_blocks_x86_64
	.rva	.LSEH_end_poly1305_blocks_x86_64
	.rva	.LSEH_info_poly1305_blocks_x86_64

	.rva	.LSEH_begin_poly1305_emit_x86_64
	.rva	.LSEH_end_poly1305_emit_x86_64
	.rva	.LSEH_info_poly1305_emit_x86_64
___
# The AVX and AVX2 block functions are split into three .pdata ranges each,
# because different parts of the function need different unwind handling.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_poly1305_blocks_avx512
	.rva	.LSEH_end_poly1305_blocks_avx512
	.rva	.LSEH_info_poly1305_blocks_avx512
___
# .xdata: one record per .pdata entry, naming the handler and the two
# HandlerData[] labels it compares context->Rip against.  Where both labels
# are the function start, Rip is never below the first one and always at or
# above the second, so the handler goes straight to the common tail.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_poly1305_block_init_arch:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch

.LSEH_info_poly1305_blocks_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
___
}
4230
4231
# Copy this script's leading comment header to the output, rewriting the
# leading "#" of each line to "//".  The "#!" line is skipped, blank lines
# are passed through, and copying stops at the first line that is neither
# a comment nor blank.
#
# Three-argument open with a lexical handle: the script path is never
# interpreted as a mode or a pipe, and a failure is reported.
open(my $self, '<', $0) or die "can't open $0: $!";
while (my $line = <$self>) {
	next if ($line =~ /^#!/);
	last if (!($line =~ s/^#/\/\//) and $line !~ /^$/);
	print $line;
}
close $self;

# Post-process the accumulated assembly one line at a time and print it.
# The loop keeps using $_ so that the string eval below sees exactly the
# same environment as before.
foreach (split(/\n/,$code)) {
	# `expr` is a constant expression written by this generator itself;
	# evaluate it and splice in the result.
	s/\`([^\`]*)\`/eval($1)/ge;

	# "%rax#d" -> "%eax", "%r8#d" -> "%r8d": 32-bit register views.
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;

	# "%x#%ymmN" and friends: force the xmm/ymm/zmm view of a register.
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	if ($kernel) {
		# Kernel builds: drop the argument count from .type, turn
		# @abi-omnipotent into @function, and omit .cfi directives.
		s/(^\.type.*),[0-9]+$/$1/;
		s/(^\.type.*),\@abi-omnipotent$/$1,\@function/;
		next if /^\.cfi.*/;
	}

	print $_,"\n";
}

# Buffered write errors (and a failing filter, if STDOUT is a pipe) only
# surface at close, so check the result.
close STDOUT or die "error closing STDOUT: $!";
4254
4255