Path: blob/master/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <[email protected]>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro\@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;
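
# Editorial reference model (not part of the original CRYPTOGAMS source; the
# sub name and the use of Math::BigInt are illustrative assumptions). It is
# never called by the generator and only documents, in plain Perl, the
# per-block update the assembly below implements: h and m are little-endian
# 128-bit values, $padbit is 1 for full 16-byte blocks, and the result is the
# new accumulator modulo 2^130 - 5.
sub poly1305_reference_block {
	use Math::BigInt;
	my ($h, $r, $m, $padbit) = @_;			# Math::BigInt values
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);	# p = 2^130 - 5
	$h = $h->copy()->badd($m)
	       ->badd(Math::BigInt->new($padbit)->blsft(128));	# h += m + padbit*2^128
	return $h->bmul($r)->bmod($p);			# h = (h * r) mod p
}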
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}

sub declare_typed_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_TYPED_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}

sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size	$name,.-$name\n";
	}
}
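# Usage note (editorial addition, not in the original source): with the
# helpers above, &declare_function("poly1305_blocks_x86_64", 32, 4) emits
#
#	SYM_FUNC_START(poly1305_blocks_x86_64)
#	.Lpoly1305_blocks_x86_64:
#
# for kernel builds ($kernel != 0, where SYM_FUNC_START supplies alignment and
# type information), and
#
#	.globl	poly1305_blocks_x86_64
#	.type	poly1305_blocks_x86_64,@function,4
#	.align	32
#	poly1305_blocks_x86_64:
#
# for standalone builds, whose output is then piped through x86_64-xlate.pl.
# declare_typed_function() differs only in using SYM_TYPED_FUNC_START, and
# end_function() closes the symbol with SYM_FUNC_END or a .size directive.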
$code.=<<___ if $kernel;
#include <linux/cfi_types.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_block_init_arch
.hidden	poly1305_block_init_arch
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
&declare_typed_function("poly1305_block_init_arch", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	test	$inp,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_block_init_arch");
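# Editorial note (not in the original source): the two 64-bit masks applied to
# the key words above implement the standard Poly1305 "clamp", i.e. as one
# little-endian 128-bit value
#
#	r &= 0x0ffffffc_0ffffffc_0ffffffc_0fffffff
#
# Clearing the low two bits of the upper key words makes r1 divisible by 4,
# so s1 = r1 + (r1 >> 2) computed in poly1305_blocks below equals 5*(r1/4),
# which is how the 2^130 = 5 (mod p) reduction is folded into the multiply
# in poly1305_iteration.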
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key.
There are428# 5 digits, but last four are interleaved with multiples of 5, totalling429# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.430431my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =432map("%xmm$_",(0..15));433434$code.=<<___;435.type __poly1305_block,\@abi-omnipotent436.align 32437__poly1305_block:438push $ctx439___440&poly1305_iteration();441$code.=<<___;442pop $ctx443RET444.size __poly1305_block,.-__poly1305_block445446.type __poly1305_init_avx,\@abi-omnipotent447.align 32448__poly1305_init_avx:449push %rbp450mov %rsp,%rbp451mov $r0,$h0452mov $r1,$h1453xor $h2,$h2454455lea 48+64($ctx),$ctx # size optimization456457mov $r1,%rax458call __poly1305_block # r^2459460mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26461mov \$0x3ffffff,%edx462mov $h0,$d1463and $h0#d,%eax464mov $r0,$d2465and $r0#d,%edx466mov %eax,`16*0+0-64`($ctx)467shr \$26,$d1468mov %edx,`16*0+4-64`($ctx)469shr \$26,$d2470471mov \$0x3ffffff,%eax472mov \$0x3ffffff,%edx473and $d1#d,%eax474and $d2#d,%edx475mov %eax,`16*1+0-64`($ctx)476lea (%rax,%rax,4),%eax # *5477mov %edx,`16*1+4-64`($ctx)478lea (%rdx,%rdx,4),%edx # *5479mov %eax,`16*2+0-64`($ctx)480shr \$26,$d1481mov %edx,`16*2+4-64`($ctx)482shr \$26,$d2483484mov $h1,%rax485mov $r1,%rdx486shl \$12,%rax487shl \$12,%rdx488or $d1,%rax489or $d2,%rdx490and \$0x3ffffff,%eax491and \$0x3ffffff,%edx492mov %eax,`16*3+0-64`($ctx)493lea (%rax,%rax,4),%eax # *5494mov %edx,`16*3+4-64`($ctx)495lea (%rdx,%rdx,4),%edx # *5496mov %eax,`16*4+0-64`($ctx)497mov $h1,$d1498mov %edx,`16*4+4-64`($ctx)499mov $r1,$d2500501mov \$0x3ffffff,%eax502mov \$0x3ffffff,%edx503shr \$14,$d1504shr \$14,$d2505and $d1#d,%eax506and $d2#d,%edx507mov %eax,`16*5+0-64`($ctx)508lea (%rax,%rax,4),%eax # *5509mov %edx,`16*5+4-64`($ctx)510lea (%rdx,%rdx,4),%edx # *5511mov %eax,`16*6+0-64`($ctx)512shr \$26,$d1513mov %edx,`16*6+4-64`($ctx)514shr \$26,$d2515516mov $h2,%rax517shl \$24,%rax518or %rax,$d1519mov $d1#d,`16*7+0-64`($ctx)520lea ($d1,$d1,4),$d1 # *5521mov $d2#d,`16*7+4-64`($ctx)522lea ($d2,$d2,4),$d2 # *5523mov $d1#d,`16*8+0-64`($ctx)524mov $d2#d,`16*8+4-64`($ctx)525526mov $r1,%rax527call __poly1305_block # r^3528529mov \$0x3ffffff,%eax # save r^3 base 2^26530mov $h0,$d1531and $h0#d,%eax532shr \$26,$d1533mov %eax,`16*0+12-64`($ctx)534535mov \$0x3ffffff,%edx536and $d1#d,%edx537mov %edx,`16*1+12-64`($ctx)538lea (%rdx,%rdx,4),%edx # *5539shr \$26,$d1540mov %edx,`16*2+12-64`($ctx)541542mov $h1,%rax543shl \$12,%rax544or $d1,%rax545and \$0x3ffffff,%eax546mov %eax,`16*3+12-64`($ctx)547lea (%rax,%rax,4),%eax # *5548mov $h1,$d1549mov %eax,`16*4+12-64`($ctx)550551mov \$0x3ffffff,%edx552shr \$14,$d1553and $d1#d,%edx554mov %edx,`16*5+12-64`($ctx)555lea (%rdx,%rdx,4),%edx # *5556shr \$26,$d1557mov %edx,`16*6+12-64`($ctx)558559mov $h2,%rax560shl \$24,%rax561or %rax,$d1562mov $d1#d,`16*7+12-64`($ctx)563lea ($d1,$d1,4),$d1 # *5564mov $d1#d,`16*8+12-64`($ctx)565566mov $r1,%rax567call __poly1305_block # r^4568569mov \$0x3ffffff,%eax # save r^4 base 2^26570mov $h0,$d1571and $h0#d,%eax572shr \$26,$d1573mov %eax,`16*0+8-64`($ctx)574575mov \$0x3ffffff,%edx576and $d1#d,%edx577mov %edx,`16*1+8-64`($ctx)578lea (%rdx,%rdx,4),%edx # *5579shr \$26,$d1580mov %edx,`16*2+8-64`($ctx)581582mov $h1,%rax583shl \$12,%rax584or $d1,%rax585and \$0x3ffffff,%eax586mov %eax,`16*3+8-64`($ctx)587lea (%rax,%rax,4),%eax # *5588mov $h1,$d1589mov %eax,`16*4+8-64`($ctx)590591mov \$0x3ffffff,%edx592shr \$14,$d1593and $d1#d,%edx594mov %edx,`16*5+8-64`($ctx)595lea (%rdx,%rdx,4),%edx # *5596shr \$26,$d1597mov 
%edx,`16*6+8-64`($ctx)598599mov $h2,%rax600shl \$24,%rax601or %rax,$d1602mov $d1#d,`16*7+8-64`($ctx)603lea ($d1,$d1,4),$d1 # *5604mov $d1#d,`16*8+8-64`($ctx)605606lea -48-64($ctx),$ctx # size [de-]optimization607pop %rbp608RET609.size __poly1305_init_avx,.-__poly1305_init_avx610___611612&declare_function("poly1305_blocks_avx", 32, 4);613$code.=<<___;614.cfi_startproc615mov 20($ctx),%r8d # is_base2_26616cmp \$128,$len617jae .Lblocks_avx618test %r8d,%r8d619jz .Lblocks620621.Lblocks_avx:622and \$-16,$len623jz .Lno_data_avx624625vzeroupper626627test %r8d,%r8d628jz .Lbase2_64_avx629630test \$31,$len631jz .Leven_avx632633push %rbp634.cfi_push %rbp635mov %rsp,%rbp636push %rbx637.cfi_push %rbx638push %r12639.cfi_push %r12640push %r13641.cfi_push %r13642push %r14643.cfi_push %r14644push %r15645.cfi_push %r15646.Lblocks_avx_body:647648mov $len,%r15 # reassign $len649650mov 0($ctx),$d1 # load hash value651mov 8($ctx),$d2652mov 16($ctx),$h2#d653654mov 24($ctx),$r0 # load r655mov 32($ctx),$s1656657################################# base 2^26 -> base 2^64658mov $d1#d,$h0#d659and \$`-1*(1<<31)`,$d1660mov $d2,$r1 # borrow $r1661mov $d2#d,$h1#d662and \$`-1*(1<<31)`,$d2663664shr \$6,$d1665shl \$52,$r1666add $d1,$h0667shr \$12,$h1668shr \$18,$d2669add $r1,$h0670adc $d2,$h1671672mov $h2,$d1673shl \$40,$d1674shr \$24,$h2675add $d1,$h1676adc \$0,$h2 # can be partially reduced...677678mov \$-4,$d2 # ... so reduce679mov $h2,$d1680and $h2,$d2681shr \$2,$d1682and \$3,$h2683add $d2,$d1 # =*5684add $d1,$h0685adc \$0,$h1686adc \$0,$h2687688mov $s1,$r1689mov $s1,%rax690shr \$2,$s1691add $r1,$s1 # s1 = r1 + (r1 >> 2)692693add 0($inp),$h0 # accumulate input694adc 8($inp),$h1695lea 16($inp),$inp696adc $padbit,$h2697698call __poly1305_block699700test $padbit,$padbit # if $padbit is zero,701jz .Lstore_base2_64_avx # store hash in base 2^64 format702703################################# base 2^64 -> base 2^26704mov $h0,%rax705mov $h0,%rdx706shr \$52,$h0707mov $h1,$r0708mov $h1,$r1709shr \$26,%rdx710and \$0x3ffffff,%rax # h[0]711shl \$12,$r0712and \$0x3ffffff,%rdx # h[1]713shr \$14,$h1714or $r0,$h0715shl \$24,$h2716and \$0x3ffffff,$h0 # h[2]717shr \$40,$r1718and \$0x3ffffff,$h1 # h[3]719or $r1,$h2 # h[4]720721sub \$16,%r15722jz .Lstore_base2_26_avx723724vmovd %rax#d,$H0725vmovd %rdx#d,$H1726vmovd $h0#d,$H2727vmovd $h1#d,$H3728vmovd $h2#d,$H4729jmp .Lproceed_avx730731.align 32732.Lstore_base2_64_avx:733mov $h0,0($ctx)734mov $h1,8($ctx)735mov $h2,16($ctx) # note that is_base2_26 is zeroed736jmp .Ldone_avx737738.align 16739.Lstore_base2_26_avx:740mov %rax#d,0($ctx) # store hash value base 2^26741mov %rdx#d,4($ctx)742mov $h0#d,8($ctx)743mov $h1#d,12($ctx)744mov $h2#d,16($ctx)745.align 16746.Ldone_avx:747pop %r15748.cfi_restore %r15749pop %r14750.cfi_restore %r14751pop %r13752.cfi_restore %r13753pop %r12754.cfi_restore %r12755pop %rbx756.cfi_restore %rbx757pop %rbp758.cfi_restore %rbp759.Lno_data_avx:760.Lblocks_avx_epilogue:761RET762.cfi_endproc763764.align 32765.Lbase2_64_avx:766.cfi_startproc767push %rbp768.cfi_push %rbp769mov %rsp,%rbp770push %rbx771.cfi_push %rbx772push %r12773.cfi_push %r12774push %r13775.cfi_push %r13776push %r14777.cfi_push %r14778push %r15779.cfi_push %r15780.Lbase2_64_avx_body:781782mov $len,%r15 # reassign $len783784mov 24($ctx),$r0 # load r785mov 32($ctx),$s1786787mov 0($ctx),$h0 # load hash value788mov 8($ctx),$h1789mov 16($ctx),$h2#d790791mov $s1,$r1792mov $s1,%rax793shr \$2,$s1794add $r1,$s1 # s1 = r1 + (r1 >> 2)795796test \$31,$len797jz .Linit_avx798799add 0($inp),$h0 # accumulate input800adc 
8($inp),$h1801lea 16($inp),$inp802adc $padbit,$h2803sub \$16,%r15804805call __poly1305_block806807.Linit_avx:808################################# base 2^64 -> base 2^26809mov $h0,%rax810mov $h0,%rdx811shr \$52,$h0812mov $h1,$d1813mov $h1,$d2814shr \$26,%rdx815and \$0x3ffffff,%rax # h[0]816shl \$12,$d1817and \$0x3ffffff,%rdx # h[1]818shr \$14,$h1819or $d1,$h0820shl \$24,$h2821and \$0x3ffffff,$h0 # h[2]822shr \$40,$d2823and \$0x3ffffff,$h1 # h[3]824or $d2,$h2 # h[4]825826vmovd %rax#d,$H0827vmovd %rdx#d,$H1828vmovd $h0#d,$H2829vmovd $h1#d,$H3830vmovd $h2#d,$H4831movl \$1,20($ctx) # set is_base2_26832833call __poly1305_init_avx834835.Lproceed_avx:836mov %r15,$len837pop %r15838.cfi_restore %r15839pop %r14840.cfi_restore %r14841pop %r13842.cfi_restore %r13843pop %r12844.cfi_restore %r12845pop %rbx846.cfi_restore %rbx847pop %rbp848.cfi_restore %rbp849.Lbase2_64_avx_epilogue:850jmp .Ldo_avx851.cfi_endproc852853.align 32854.Leven_avx:855.cfi_startproc856vmovd 4*0($ctx),$H0 # load hash value857vmovd 4*1($ctx),$H1858vmovd 4*2($ctx),$H2859vmovd 4*3($ctx),$H3860vmovd 4*4($ctx),$H4861862.Ldo_avx:863___864$code.=<<___ if (!$win64);865lea 8(%rsp),%r10866.cfi_def_cfa_register %r10867and \$-32,%rsp868sub \$-8,%rsp869lea -0x58(%rsp),%r11870sub \$0x178,%rsp871___872$code.=<<___ if ($win64);873lea -0xf8(%rsp),%r11874sub \$0x218,%rsp875vmovdqa %xmm6,0x50(%r11)876vmovdqa %xmm7,0x60(%r11)877vmovdqa %xmm8,0x70(%r11)878vmovdqa %xmm9,0x80(%r11)879vmovdqa %xmm10,0x90(%r11)880vmovdqa %xmm11,0xa0(%r11)881vmovdqa %xmm12,0xb0(%r11)882vmovdqa %xmm13,0xc0(%r11)883vmovdqa %xmm14,0xd0(%r11)884vmovdqa %xmm15,0xe0(%r11)885.Ldo_avx_body:886___887$code.=<<___;888sub \$64,$len889lea -32($inp),%rax890cmovc %rax,$inp891892vmovdqu `16*3`($ctx),$D4 # preload r0^2893lea `16*3+64`($ctx),$ctx # size optimization894lea .Lconst(%rip),%rcx895896################################################################897# load input898vmovdqu 16*2($inp),$T0899vmovdqu 16*3($inp),$T1900vmovdqa 64(%rcx),$MASK # .Lmask26901902vpsrldq \$6,$T0,$T2 # splat input903vpsrldq \$6,$T1,$T3904vpunpckhqdq $T1,$T0,$T4 # 4905vpunpcklqdq $T1,$T0,$T0 # 0:1906vpunpcklqdq $T3,$T2,$T3 # 2:3907908vpsrlq \$40,$T4,$T4 # 4909vpsrlq \$26,$T0,$T1910vpand $MASK,$T0,$T0 # 0911vpsrlq \$4,$T3,$T2912vpand $MASK,$T1,$T1 # 1913vpsrlq \$30,$T3,$T3914vpand $MASK,$T2,$T2 # 2915vpand $MASK,$T3,$T3 # 3916vpor 32(%rcx),$T4,$T4 # padbit, yes, always917918jbe .Lskip_loop_avx919920# expand and copy pre-calculated table to stack921vmovdqu `16*1-64`($ctx),$D1922vmovdqu `16*2-64`($ctx),$D2923vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434924vpshufd \$0x44,$D4,$D0 # xx12 -> 1212925vmovdqa $D3,-0x90(%r11)926vmovdqa $D0,0x00(%rsp)927vpshufd \$0xEE,$D1,$D4928vmovdqu `16*3-64`($ctx),$D0929vpshufd \$0x44,$D1,$D1930vmovdqa $D4,-0x80(%r11)931vmovdqa $D1,0x10(%rsp)932vpshufd \$0xEE,$D2,$D3933vmovdqu `16*4-64`($ctx),$D1934vpshufd \$0x44,$D2,$D2935vmovdqa $D3,-0x70(%r11)936vmovdqa $D2,0x20(%rsp)937vpshufd \$0xEE,$D0,$D4938vmovdqu `16*5-64`($ctx),$D2939vpshufd \$0x44,$D0,$D0940vmovdqa $D4,-0x60(%r11)941vmovdqa $D0,0x30(%rsp)942vpshufd \$0xEE,$D1,$D3943vmovdqu `16*6-64`($ctx),$D0944vpshufd \$0x44,$D1,$D1945vmovdqa $D3,-0x50(%r11)946vmovdqa $D1,0x40(%rsp)947vpshufd \$0xEE,$D2,$D4948vmovdqu `16*7-64`($ctx),$D1949vpshufd \$0x44,$D2,$D2950vmovdqa $D4,-0x40(%r11)951vmovdqa $D2,0x50(%rsp)952vpshufd \$0xEE,$D0,$D3953vmovdqu `16*8-64`($ctx),$D2954vpshufd \$0x44,$D0,$D0955vmovdqa $D3,-0x30(%r11)956vmovdqa $D0,0x60(%rsp)957vpshufd \$0xEE,$D1,$D4958vpshufd \$0x44,$D1,$D1959vmovdqa $D4,-0x20(%r11)960vmovdqa 
$D1,0x70(%rsp)961vpshufd \$0xEE,$D2,$D3962vmovdqa 0x00(%rsp),$D4 # preload r0^2963vpshufd \$0x44,$D2,$D2964vmovdqa $D3,-0x10(%r11)965vmovdqa $D2,0x80(%rsp)966967jmp .Loop_avx968969.align 32970.Loop_avx:971################################################################972# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2973# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r974# \___________________/975# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2976# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r977# \___________________/ \____________________/978#979# Note that we start with inp[2:3]*r^2. This is because it980# doesn't depend on reduction in previous iteration.981################################################################982# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4983# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4984# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4985# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4986# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4987#988# though note that $Tx and $Hx are "reversed" in this section,989# and $D4 is preloaded with r0^2...990991vpmuludq $T0,$D4,$D0 # d0 = h0*r0992vpmuludq $T1,$D4,$D1 # d1 = h1*r0993vmovdqa $H2,0x20(%r11) # offload hash994vpmuludq $T2,$D4,$D2 # d3 = h2*r0995vmovdqa 0x10(%rsp),$H2 # r1^2996vpmuludq $T3,$D4,$D3 # d3 = h3*r0997vpmuludq $T4,$D4,$D4 # d4 = h4*r0998999vmovdqa $H0,0x00(%r11) #1000vpmuludq 0x20(%rsp),$T4,$H0 # h4*s11001vmovdqa $H1,0x10(%r11) #1002vpmuludq $T3,$H2,$H1 # h3*r11003vpaddq $H0,$D0,$D0 # d0 += h4*s11004vpaddq $H1,$D4,$D4 # d4 += h3*r11005vmovdqa $H3,0x30(%r11) #1006vpmuludq $T2,$H2,$H0 # h2*r11007vpmuludq $T1,$H2,$H1 # h1*r11008vpaddq $H0,$D3,$D3 # d3 += h2*r11009vmovdqa 0x30(%rsp),$H3 # r2^21010vpaddq $H1,$D2,$D2 # d2 += h1*r11011vmovdqa $H4,0x40(%r11) #1012vpmuludq $T0,$H2,$H2 # h0*r11013vpmuludq $T2,$H3,$H0 # h2*r21014vpaddq $H2,$D1,$D1 # d1 += h0*r110151016vmovdqa 0x40(%rsp),$H4 # s2^21017vpaddq $H0,$D4,$D4 # d4 += h2*r21018vpmuludq $T1,$H3,$H1 # h1*r21019vpmuludq $T0,$H3,$H3 # h0*r21020vpaddq $H1,$D3,$D3 # d3 += h1*r21021vmovdqa 0x50(%rsp),$H2 # r3^21022vpaddq $H3,$D2,$D2 # d2 += h0*r21023vpmuludq $T4,$H4,$H0 # h4*s21024vpmuludq $T3,$H4,$H4 # h3*s21025vpaddq $H0,$D1,$D1 # d1 += h4*s21026vmovdqa 0x60(%rsp),$H3 # s3^21027vpaddq $H4,$D0,$D0 # d0 += h3*s210281029vmovdqa 0x80(%rsp),$H4 # s4^21030vpmuludq $T1,$H2,$H1 # h1*r31031vpmuludq $T0,$H2,$H2 # h0*r31032vpaddq $H1,$D4,$D4 # d4 += h1*r31033vpaddq $H2,$D3,$D3 # d3 += h0*r31034vpmuludq $T4,$H3,$H0 # h4*s31035vpmuludq $T3,$H3,$H1 # h3*s31036vpaddq $H0,$D2,$D2 # d2 += h4*s31037vmovdqu 16*0($inp),$H0 # load input1038vpaddq $H1,$D1,$D1 # d1 += h3*s31039vpmuludq $T2,$H3,$H3 # h2*s31040vpmuludq $T2,$H4,$T2 # h2*s41041vpaddq $H3,$D0,$D0 # d0 += h2*s310421043vmovdqu 16*1($inp),$H1 #1044vpaddq $T2,$D1,$D1 # d1 += h2*s41045vpmuludq $T3,$H4,$T3 # h3*s41046vpmuludq $T4,$H4,$T4 # h4*s41047vpsrldq \$6,$H0,$H2 # splat input1048vpaddq $T3,$D2,$D2 # d2 += h3*s41049vpaddq $T4,$D3,$D3 # d3 += h4*s41050vpsrldq \$6,$H1,$H3 #1051vpmuludq 0x70(%rsp),$T0,$T4 # h0*r41052vpmuludq $T1,$H4,$T0 # h1*s41053vpunpckhqdq $H1,$H0,$H4 # 41054vpaddq $T4,$D4,$D4 # d4 += h0*r41055vmovdqa -0x90(%r11),$T4 # r0^41056vpaddq $T0,$D0,$D0 # d0 += h1*s410571058vpunpcklqdq $H1,$H0,$H0 # 0:11059vpunpcklqdq $H3,$H2,$H3 # 2:310601061#vpsrlq \$40,$H4,$H4 # 41062vpsrldq \$`40/8`,$H4,$H4 # 41063vpsrlq \$26,$H0,$H11064vpand $MASK,$H0,$H0 # 01065vpsrlq \$4,$H3,$H21066vpand $MASK,$H1,$H1 # 11067vpand 0(%rcx),$H4,$H4 # .Lmask241068vpsrlq \$30,$H3,$H31069vpand $MASK,$H2,$H2 # 
21070vpand $MASK,$H3,$H3 # 31071vpor 32(%rcx),$H4,$H4 # padbit, yes, always10721073vpaddq 0x00(%r11),$H0,$H0 # add hash value1074vpaddq 0x10(%r11),$H1,$H11075vpaddq 0x20(%r11),$H2,$H21076vpaddq 0x30(%r11),$H3,$H31077vpaddq 0x40(%r11),$H4,$H410781079lea 16*2($inp),%rax1080lea 16*4($inp),$inp1081sub \$64,$len1082cmovc %rax,$inp10831084################################################################1085# Now we accumulate (inp[0:1]+hash)*r^41086################################################################1087# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41088# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41089# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41090# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41091# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r410921093vpmuludq $H0,$T4,$T0 # h0*r01094vpmuludq $H1,$T4,$T1 # h1*r01095vpaddq $T0,$D0,$D01096vpaddq $T1,$D1,$D11097vmovdqa -0x80(%r11),$T2 # r1^41098vpmuludq $H2,$T4,$T0 # h2*r01099vpmuludq $H3,$T4,$T1 # h3*r01100vpaddq $T0,$D2,$D21101vpaddq $T1,$D3,$D31102vpmuludq $H4,$T4,$T4 # h4*r01103vpmuludq -0x70(%r11),$H4,$T0 # h4*s11104vpaddq $T4,$D4,$D411051106vpaddq $T0,$D0,$D0 # d0 += h4*s11107vpmuludq $H2,$T2,$T1 # h2*r11108vpmuludq $H3,$T2,$T0 # h3*r11109vpaddq $T1,$D3,$D3 # d3 += h2*r11110vmovdqa -0x60(%r11),$T3 # r2^41111vpaddq $T0,$D4,$D4 # d4 += h3*r11112vpmuludq $H1,$T2,$T1 # h1*r11113vpmuludq $H0,$T2,$T2 # h0*r11114vpaddq $T1,$D2,$D2 # d2 += h1*r11115vpaddq $T2,$D1,$D1 # d1 += h0*r111161117vmovdqa -0x50(%r11),$T4 # s2^41118vpmuludq $H2,$T3,$T0 # h2*r21119vpmuludq $H1,$T3,$T1 # h1*r21120vpaddq $T0,$D4,$D4 # d4 += h2*r21121vpaddq $T1,$D3,$D3 # d3 += h1*r21122vmovdqa -0x40(%r11),$T2 # r3^41123vpmuludq $H0,$T3,$T3 # h0*r21124vpmuludq $H4,$T4,$T0 # h4*s21125vpaddq $T3,$D2,$D2 # d2 += h0*r21126vpaddq $T0,$D1,$D1 # d1 += h4*s21127vmovdqa -0x30(%r11),$T3 # s3^41128vpmuludq $H3,$T4,$T4 # h3*s21129vpmuludq $H1,$T2,$T1 # h1*r31130vpaddq $T4,$D0,$D0 # d0 += h3*s211311132vmovdqa -0x10(%r11),$T4 # s4^41133vpaddq $T1,$D4,$D4 # d4 += h1*r31134vpmuludq $H0,$T2,$T2 # h0*r31135vpmuludq $H4,$T3,$T0 # h4*s31136vpaddq $T2,$D3,$D3 # d3 += h0*r31137vpaddq $T0,$D2,$D2 # d2 += h4*s31138vmovdqu 16*2($inp),$T0 # load input1139vpmuludq $H3,$T3,$T2 # h3*s31140vpmuludq $H2,$T3,$T3 # h2*s31141vpaddq $T2,$D1,$D1 # d1 += h3*s31142vmovdqu 16*3($inp),$T1 #1143vpaddq $T3,$D0,$D0 # d0 += h2*s311441145vpmuludq $H2,$T4,$H2 # h2*s41146vpmuludq $H3,$T4,$H3 # h3*s41147vpsrldq \$6,$T0,$T2 # splat input1148vpaddq $H2,$D1,$D1 # d1 += h2*s41149vpmuludq $H4,$T4,$H4 # h4*s41150vpsrldq \$6,$T1,$T3 #1151vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s41152vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s41153vpmuludq -0x20(%r11),$H0,$H4 # h0*r41154vpmuludq $H1,$T4,$H01155vpunpckhqdq $T1,$T0,$T4 # 41156vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r41157vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s411581159vpunpcklqdq $T1,$T0,$T0 # 0:11160vpunpcklqdq $T3,$T2,$T3 # 2:311611162#vpsrlq \$40,$T4,$T4 # 41163vpsrldq \$`40/8`,$T4,$T4 # 41164vpsrlq \$26,$T0,$T11165vmovdqa 0x00(%rsp),$D4 # preload r0^21166vpand $MASK,$T0,$T0 # 01167vpsrlq \$4,$T3,$T21168vpand $MASK,$T1,$T1 # 11169vpand 0(%rcx),$T4,$T4 # .Lmask241170vpsrlq \$30,$T3,$T31171vpand $MASK,$T2,$T2 # 21172vpand $MASK,$T3,$T3 # 31173vpor 32(%rcx),$T4,$T4 # padbit, yes, always11741175################################################################1176# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein1177# and P. 
Schwabe11781179vpsrlq \$26,$H3,$D31180vpand $MASK,$H3,$H31181vpaddq $D3,$H4,$H4 # h3 -> h411821183vpsrlq \$26,$H0,$D01184vpand $MASK,$H0,$H01185vpaddq $D0,$D1,$H1 # h0 -> h111861187vpsrlq \$26,$H4,$D01188vpand $MASK,$H4,$H411891190vpsrlq \$26,$H1,$D11191vpand $MASK,$H1,$H11192vpaddq $D1,$H2,$H2 # h1 -> h211931194vpaddq $D0,$H0,$H01195vpsllq \$2,$D0,$D01196vpaddq $D0,$H0,$H0 # h4 -> h011971198vpsrlq \$26,$H2,$D21199vpand $MASK,$H2,$H21200vpaddq $D2,$H3,$H3 # h2 -> h312011202vpsrlq \$26,$H0,$D01203vpand $MASK,$H0,$H01204vpaddq $D0,$H1,$H1 # h0 -> h112051206vpsrlq \$26,$H3,$D31207vpand $MASK,$H3,$H31208vpaddq $D3,$H4,$H4 # h3 -> h412091210ja .Loop_avx12111212.Lskip_loop_avx:1213################################################################1214# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^112151216vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x21217add \$32,$len1218jnz .Long_tail_avx12191220vpaddq $H2,$T2,$T21221vpaddq $H0,$T0,$T01222vpaddq $H1,$T1,$T11223vpaddq $H3,$T3,$T31224vpaddq $H4,$T4,$T412251226.Long_tail_avx:1227vmovdqa $H2,0x20(%r11)1228vmovdqa $H0,0x00(%r11)1229vmovdqa $H1,0x10(%r11)1230vmovdqa $H3,0x30(%r11)1231vmovdqa $H4,0x40(%r11)12321233# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41234# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41235# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41236# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41237# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r412381239vpmuludq $T2,$D4,$D2 # d2 = h2*r01240vpmuludq $T0,$D4,$D0 # d0 = h0*r01241vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n1242vpmuludq $T1,$D4,$D1 # d1 = h1*r01243vpmuludq $T3,$D4,$D3 # d3 = h3*r01244vpmuludq $T4,$D4,$D4 # d4 = h4*r012451246vpmuludq $T3,$H2,$H0 # h3*r11247vpaddq $H0,$D4,$D4 # d4 += h3*r11248vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n1249vpmuludq $T2,$H2,$H1 # h2*r11250vpaddq $H1,$D3,$D3 # d3 += h2*r11251vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n1252vpmuludq $T1,$H2,$H0 # h1*r11253vpaddq $H0,$D2,$D2 # d2 += h1*r11254vpmuludq $T0,$H2,$H2 # h0*r11255vpaddq $H2,$D1,$D1 # d1 += h0*r11256vpmuludq $T4,$H3,$H3 # h4*s11257vpaddq $H3,$D0,$D0 # d0 += h4*s112581259vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n1260vpmuludq $T2,$H4,$H1 # h2*r21261vpaddq $H1,$D4,$D4 # d4 += h2*r21262vpmuludq $T1,$H4,$H0 # h1*r21263vpaddq $H0,$D3,$D3 # d3 += h1*r21264vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n1265vpmuludq $T0,$H4,$H4 # h0*r21266vpaddq $H4,$D2,$D2 # d2 += h0*r21267vpmuludq $T4,$H2,$H1 # h4*s21268vpaddq $H1,$D1,$D1 # d1 += h4*s21269vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n1270vpmuludq $T3,$H2,$H2 # h3*s21271vpaddq $H2,$D0,$D0 # d0 += h3*s212721273vpmuludq $T1,$H3,$H0 # h1*r31274vpaddq $H0,$D4,$D4 # d4 += h1*r31275vpmuludq $T0,$H3,$H3 # h0*r31276vpaddq $H3,$D3,$D3 # d3 += h0*r31277vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n1278vpmuludq $T4,$H4,$H1 # h4*s31279vpaddq $H1,$D2,$D2 # d2 += h4*s31280vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n1281vpmuludq $T3,$H4,$H0 # h3*s31282vpaddq $H0,$D1,$D1 # d1 += h3*s31283vpmuludq $T2,$H4,$H4 # h2*s31284vpaddq $H4,$D0,$D0 # d0 += h2*s312851286vpmuludq $T0,$H2,$H2 # h0*r41287vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r41288vpmuludq $T4,$H3,$H1 # h4*s41289vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s41290vpmuludq $T3,$H3,$H0 # h3*s41291vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s41292vpmuludq $T2,$H3,$H1 # h2*s41293vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s41294vpmuludq $T1,$H3,$H3 # h1*s41295vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s412961297jz .Lshort_tail_avx12981299vmovdqu 16*0($inp),$H0 # load input1300vmovdqu 16*1($inp),$H113011302vpsrldq \$6,$H0,$H2 # splat input1303vpsrldq \$6,$H1,$H31304vpunpckhqdq $H1,$H0,$H4 
# 41305vpunpcklqdq $H1,$H0,$H0 # 0:11306vpunpcklqdq $H3,$H2,$H3 # 2:313071308vpsrlq \$40,$H4,$H4 # 41309vpsrlq \$26,$H0,$H11310vpand $MASK,$H0,$H0 # 01311vpsrlq \$4,$H3,$H21312vpand $MASK,$H1,$H1 # 11313vpsrlq \$30,$H3,$H31314vpand $MASK,$H2,$H2 # 21315vpand $MASK,$H3,$H3 # 31316vpor 32(%rcx),$H4,$H4 # padbit, yes, always13171318vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x41319vpaddq 0x00(%r11),$H0,$H01320vpaddq 0x10(%r11),$H1,$H11321vpaddq 0x20(%r11),$H2,$H21322vpaddq 0x30(%r11),$H3,$H31323vpaddq 0x40(%r11),$H4,$H413241325################################################################1326# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate13271328vpmuludq $H0,$T4,$T0 # h0*r01329vpaddq $T0,$D0,$D0 # d0 += h0*r01330vpmuludq $H1,$T4,$T1 # h1*r01331vpaddq $T1,$D1,$D1 # d1 += h1*r01332vpmuludq $H2,$T4,$T0 # h2*r01333vpaddq $T0,$D2,$D2 # d2 += h2*r01334vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n1335vpmuludq $H3,$T4,$T1 # h3*r01336vpaddq $T1,$D3,$D3 # d3 += h3*r01337vpmuludq $H4,$T4,$T4 # h4*r01338vpaddq $T4,$D4,$D4 # d4 += h4*r013391340vpmuludq $H3,$T2,$T0 # h3*r11341vpaddq $T0,$D4,$D4 # d4 += h3*r11342vpshufd \$0x32,`16*2-64`($ctx),$T3 # s11343vpmuludq $H2,$T2,$T1 # h2*r11344vpaddq $T1,$D3,$D3 # d3 += h2*r11345vpshufd \$0x32,`16*3-64`($ctx),$T4 # r21346vpmuludq $H1,$T2,$T0 # h1*r11347vpaddq $T0,$D2,$D2 # d2 += h1*r11348vpmuludq $H0,$T2,$T2 # h0*r11349vpaddq $T2,$D1,$D1 # d1 += h0*r11350vpmuludq $H4,$T3,$T3 # h4*s11351vpaddq $T3,$D0,$D0 # d0 += h4*s113521353vpshufd \$0x32,`16*4-64`($ctx),$T2 # s21354vpmuludq $H2,$T4,$T1 # h2*r21355vpaddq $T1,$D4,$D4 # d4 += h2*r21356vpmuludq $H1,$T4,$T0 # h1*r21357vpaddq $T0,$D3,$D3 # d3 += h1*r21358vpshufd \$0x32,`16*5-64`($ctx),$T3 # r31359vpmuludq $H0,$T4,$T4 # h0*r21360vpaddq $T4,$D2,$D2 # d2 += h0*r21361vpmuludq $H4,$T2,$T1 # h4*s21362vpaddq $T1,$D1,$D1 # d1 += h4*s21363vpshufd \$0x32,`16*6-64`($ctx),$T4 # s31364vpmuludq $H3,$T2,$T2 # h3*s21365vpaddq $T2,$D0,$D0 # d0 += h3*s213661367vpmuludq $H1,$T3,$T0 # h1*r31368vpaddq $T0,$D4,$D4 # d4 += h1*r31369vpmuludq $H0,$T3,$T3 # h0*r31370vpaddq $T3,$D3,$D3 # d3 += h0*r31371vpshufd \$0x32,`16*7-64`($ctx),$T2 # r41372vpmuludq $H4,$T4,$T1 # h4*s31373vpaddq $T1,$D2,$D2 # d2 += h4*s31374vpshufd \$0x32,`16*8-64`($ctx),$T3 # s41375vpmuludq $H3,$T4,$T0 # h3*s31376vpaddq $T0,$D1,$D1 # d1 += h3*s31377vpmuludq $H2,$T4,$T4 # h2*s31378vpaddq $T4,$D0,$D0 # d0 += h2*s313791380vpmuludq $H0,$T2,$T2 # h0*r41381vpaddq $T2,$D4,$D4 # d4 += h0*r41382vpmuludq $H4,$T3,$T1 # h4*s41383vpaddq $T1,$D3,$D3 # d3 += h4*s41384vpmuludq $H3,$T3,$T0 # h3*s41385vpaddq $T0,$D2,$D2 # d2 += h3*s41386vpmuludq $H2,$T3,$T1 # h2*s41387vpaddq $T1,$D1,$D1 # d1 += h2*s41388vpmuludq $H1,$T3,$T3 # h1*s41389vpaddq $T3,$D0,$D0 # d0 += h1*s413901391.Lshort_tail_avx:1392################################################################1393# horizontal addition13941395vpsrldq \$8,$D4,$T41396vpsrldq \$8,$D3,$T31397vpsrldq \$8,$D1,$T11398vpsrldq \$8,$D0,$T01399vpsrldq \$8,$D2,$T21400vpaddq $T3,$D3,$D31401vpaddq $T4,$D4,$D41402vpaddq $T0,$D0,$D01403vpaddq $T1,$D1,$D11404vpaddq $T2,$D2,$D214051406################################################################1407# lazy reduction14081409vpsrlq \$26,$D3,$H31410vpand $MASK,$D3,$D31411vpaddq $H3,$D4,$D4 # h3 -> h414121413vpsrlq \$26,$D0,$H01414vpand $MASK,$D0,$D01415vpaddq $H0,$D1,$D1 # h0 -> h114161417vpsrlq \$26,$D4,$H41418vpand $MASK,$D4,$D414191420vpsrlq \$26,$D1,$H11421vpand $MASK,$D1,$D11422vpaddq $H1,$D2,$D2 # h1 -> h214231424vpaddq $H4,$D0,$D01425vpsllq \$2,$H4,$H41426vpaddq $H4,$D0,$D0 # h4 -> 
h014271428vpsrlq \$26,$D2,$H21429vpand $MASK,$D2,$D21430vpaddq $H2,$D3,$D3 # h2 -> h314311432vpsrlq \$26,$D0,$H01433vpand $MASK,$D0,$D01434vpaddq $H0,$D1,$D1 # h0 -> h114351436vpsrlq \$26,$D3,$H31437vpand $MASK,$D3,$D31438vpaddq $H3,$D4,$D4 # h3 -> h414391440vmovd $D0,`4*0-48-64`($ctx) # save partially reduced1441vmovd $D1,`4*1-48-64`($ctx)1442vmovd $D2,`4*2-48-64`($ctx)1443vmovd $D3,`4*3-48-64`($ctx)1444vmovd $D4,`4*4-48-64`($ctx)1445___1446$code.=<<___ if ($win64);1447vmovdqa 0x50(%r11),%xmm61448vmovdqa 0x60(%r11),%xmm71449vmovdqa 0x70(%r11),%xmm81450vmovdqa 0x80(%r11),%xmm91451vmovdqa 0x90(%r11),%xmm101452vmovdqa 0xa0(%r11),%xmm111453vmovdqa 0xb0(%r11),%xmm121454vmovdqa 0xc0(%r11),%xmm131455vmovdqa 0xd0(%r11),%xmm141456vmovdqa 0xe0(%r11),%xmm151457lea 0xf8(%r11),%rsp1458.Ldo_avx_epilogue:1459___1460$code.=<<___ if (!$win64);1461lea -8(%r10),%rsp1462.cfi_def_cfa_register %rsp1463___1464$code.=<<___;1465vzeroupper1466RET1467.cfi_endproc1468___1469&end_function("poly1305_blocks_avx");14701471&declare_function("poly1305_emit_avx", 32, 3);1472$code.=<<___;1473cmpl \$0,20($ctx) # is_base2_26?1474je .Lemit14751476mov 0($ctx),%eax # load hash value base 2^261477mov 4($ctx),%ecx1478mov 8($ctx),%r8d1479mov 12($ctx),%r11d1480mov 16($ctx),%r10d14811482shl \$26,%rcx # base 2^26 -> base 2^641483mov %r8,%r91484shl \$52,%r81485add %rcx,%rax1486shr \$12,%r91487add %rax,%r8 # h01488adc \$0,%r914891490shl \$14,%r111491mov %r10,%rax1492shr \$24,%r101493add %r11,%r91494shl \$40,%rax1495add %rax,%r9 # h11496adc \$0,%r10 # h214971498mov %r10,%rax # could be partially reduced, so reduce1499mov %r10,%rcx1500and \$3,%r101501shr \$2,%rax1502and \$-4,%rcx1503add %rcx,%rax1504add %rax,%r81505adc \$0,%r91506adc \$0,%r1015071508mov %r8,%rax1509add \$5,%r8 # compare to modulus1510mov %r9,%rcx1511adc \$0,%r91512adc \$0,%r101513shr \$2,%r10 # did 130-bit value overflow?1514cmovnz %r8,%rax1515cmovnz %r9,%rcx15161517add 0($nonce),%rax # accumulate nonce1518adc 8($nonce),%rcx1519mov %rax,0($mac) # write result1520mov %rcx,8($mac)15211522RET1523___1524&end_function("poly1305_emit_avx");15251526if ($avx>1) {15271528my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =1529map("%ymm$_",(0..15));1530my $S4=$MASK;15311532sub poly1305_blocks_avxN {1533my ($avx512) = @_;1534my $suffix = $avx512 ? "_avx512" : "";1535$code.=<<___;1536.cfi_startproc1537mov 20($ctx),%r8d # is_base2_261538cmp \$128,$len1539jae .Lblocks_avx2$suffix1540test %r8d,%r8d1541jz .Lblocks15421543.Lblocks_avx2$suffix:1544and \$-16,$len1545jz .Lno_data_avx2$suffix15461547vzeroupper15481549test %r8d,%r8d1550jz .Lbase2_64_avx2$suffix15511552test \$63,$len1553jz .Leven_avx2$suffix15541555push %rbp1556.cfi_push %rbp1557mov %rsp,%rbp1558push %rbx1559.cfi_push %rbx1560push %r121561.cfi_push %r121562push %r131563.cfi_push %r131564push %r141565.cfi_push %r141566push %r151567.cfi_push %r151568.Lblocks_avx2_body$suffix:15691570mov $len,%r15 # reassign $len15711572mov 0($ctx),$d1 # load hash value1573mov 8($ctx),$d21574mov 16($ctx),$h2#d15751576mov 24($ctx),$r0 # load r1577mov 32($ctx),$s115781579################################# base 2^26 -> base 2^641580mov $d1#d,$h0#d1581and \$`-1*(1<<31)`,$d11582mov $d2,$r1 # borrow $r11583mov $d2#d,$h1#d1584and \$`-1*(1<<31)`,$d215851586shr \$6,$d11587shl \$52,$r11588add $d1,$h01589shr \$12,$h11590shr \$18,$d21591add $r1,$h01592adc $d2,$h115931594mov $h2,$d11595shl \$40,$d11596shr \$24,$h21597add $d1,$h11598adc \$0,$h2 # can be partially reduced...15991600mov \$-4,$d2 # ... 
so reduce1601mov $h2,$d11602and $h2,$d21603shr \$2,$d11604and \$3,$h21605add $d2,$d1 # =*51606add $d1,$h01607adc \$0,$h11608adc \$0,$h216091610mov $s1,$r11611mov $s1,%rax1612shr \$2,$s11613add $r1,$s1 # s1 = r1 + (r1 >> 2)16141615.Lbase2_26_pre_avx2$suffix:1616add 0($inp),$h0 # accumulate input1617adc 8($inp),$h11618lea 16($inp),$inp1619adc $padbit,$h21620sub \$16,%r1516211622call __poly1305_block1623mov $r1,%rax16241625test \$63,%r151626jnz .Lbase2_26_pre_avx2$suffix16271628test $padbit,$padbit # if $padbit is zero,1629jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format16301631################################# base 2^64 -> base 2^261632mov $h0,%rax1633mov $h0,%rdx1634shr \$52,$h01635mov $h1,$r01636mov $h1,$r11637shr \$26,%rdx1638and \$0x3ffffff,%rax # h[0]1639shl \$12,$r01640and \$0x3ffffff,%rdx # h[1]1641shr \$14,$h11642or $r0,$h01643shl \$24,$h21644and \$0x3ffffff,$h0 # h[2]1645shr \$40,$r11646and \$0x3ffffff,$h1 # h[3]1647or $r1,$h2 # h[4]16481649test %r15,%r151650jz .Lstore_base2_26_avx2$suffix16511652vmovd %rax#d,%x#$H01653vmovd %rdx#d,%x#$H11654vmovd $h0#d,%x#$H21655vmovd $h1#d,%x#$H31656vmovd $h2#d,%x#$H41657jmp .Lproceed_avx2$suffix16581659.align 321660.Lstore_base2_64_avx2$suffix:1661mov $h0,0($ctx)1662mov $h1,8($ctx)1663mov $h2,16($ctx) # note that is_base2_26 is zeroed1664jmp .Ldone_avx2$suffix16651666.align 161667.Lstore_base2_26_avx2$suffix:1668mov %rax#d,0($ctx) # store hash value base 2^261669mov %rdx#d,4($ctx)1670mov $h0#d,8($ctx)1671mov $h1#d,12($ctx)1672mov $h2#d,16($ctx)1673.align 161674.Ldone_avx2$suffix:1675pop %r151676.cfi_restore %r151677pop %r141678.cfi_restore %r141679pop %r131680.cfi_restore %r131681pop %r121682.cfi_restore %r121683pop %rbx1684.cfi_restore %rbx1685pop %rbp1686.cfi_restore %rbp1687.Lno_data_avx2$suffix:1688.Lblocks_avx2_epilogue$suffix:1689RET1690.cfi_endproc16911692.align 321693.Lbase2_64_avx2$suffix:1694.cfi_startproc1695push %rbp1696.cfi_push %rbp1697mov %rsp,%rbp1698push %rbx1699.cfi_push %rbx1700push %r121701.cfi_push %r121702push %r131703.cfi_push %r131704push %r141705.cfi_push %r141706push %r151707.cfi_push %r151708.Lbase2_64_avx2_body$suffix:17091710mov $len,%r15 # reassign $len17111712mov 24($ctx),$r0 # load r1713mov 32($ctx),$s117141715mov 0($ctx),$h0 # load hash value1716mov 8($ctx),$h11717mov 16($ctx),$h2#d17181719mov $s1,$r11720mov $s1,%rax1721shr \$2,$s11722add $r1,$s1 # s1 = r1 + (r1 >> 2)17231724test \$63,$len1725jz .Linit_avx2$suffix17261727.Lbase2_64_pre_avx2$suffix:1728add 0($inp),$h0 # accumulate input1729adc 8($inp),$h11730lea 16($inp),$inp1731adc $padbit,$h21732sub \$16,%r1517331734call __poly1305_block1735mov $r1,%rax17361737test \$63,%r151738jnz .Lbase2_64_pre_avx2$suffix17391740.Linit_avx2$suffix:1741################################# base 2^64 -> base 2^261742mov $h0,%rax1743mov $h0,%rdx1744shr \$52,$h01745mov $h1,$d11746mov $h1,$d21747shr \$26,%rdx1748and \$0x3ffffff,%rax # h[0]1749shl \$12,$d11750and \$0x3ffffff,%rdx # h[1]1751shr \$14,$h11752or $d1,$h01753shl \$24,$h21754and \$0x3ffffff,$h0 # h[2]1755shr \$40,$d21756and \$0x3ffffff,$h1 # h[3]1757or $d2,$h2 # h[4]17581759vmovd %rax#d,%x#$H01760vmovd %rdx#d,%x#$H11761vmovd $h0#d,%x#$H21762vmovd $h1#d,%x#$H31763vmovd $h2#d,%x#$H41764movl \$1,20($ctx) # set is_base2_2617651766call __poly1305_init_avx17671768.Lproceed_avx2$suffix:1769mov %r15,$len # restore $len1770___1771$code.=<<___ if (!$kernel);1772mov OPENSSL_ia32cap_P+8(%rip),%r9d1773mov \$`(1<<31|1<<30|1<<16)`,%r11d1774___1775$code.=<<___;1776pop %r151777.cfi_restore %r151778pop %r141779.cfi_restore 
%r141780pop %r131781.cfi_restore %r131782pop %r121783.cfi_restore %r121784pop %rbx1785.cfi_restore %rbx1786pop %rbp1787.cfi_restore %rbp1788.Lbase2_64_avx2_epilogue$suffix:1789jmp .Ldo_avx2$suffix1790.cfi_endproc17911792.align 321793.Leven_avx2$suffix:1794.cfi_startproc1795___1796$code.=<<___ if (!$kernel);1797mov OPENSSL_ia32cap_P+8(%rip),%r9d1798___1799$code.=<<___;1800vmovd 4*0($ctx),%x#$H0 # load hash value base 2^261801vmovd 4*1($ctx),%x#$H11802vmovd 4*2($ctx),%x#$H21803vmovd 4*3($ctx),%x#$H31804vmovd 4*4($ctx),%x#$H418051806.Ldo_avx2$suffix:1807___1808$code.=<<___ if (!$kernel && $avx>2);1809cmp \$512,$len1810jb .Lskip_avx5121811and %r11d,%r9d1812test \$`1<<16`,%r9d # check for AVX512F1813jnz .Lblocks_avx5121814.Lskip_avx512$suffix:1815___1816$code.=<<___ if ($avx > 2 && $avx512 && $kernel);1817cmp \$512,$len1818jae .Lblocks_avx5121819___1820$code.=<<___ if (!$win64);1821lea 8(%rsp),%r101822.cfi_def_cfa_register %r101823sub \$0x128,%rsp1824___1825$code.=<<___ if ($win64);1826lea 8(%rsp),%r101827sub \$0x1c8,%rsp1828vmovdqa %xmm6,-0xb0(%r10)1829vmovdqa %xmm7,-0xa0(%r10)1830vmovdqa %xmm8,-0x90(%r10)1831vmovdqa %xmm9,-0x80(%r10)1832vmovdqa %xmm10,-0x70(%r10)1833vmovdqa %xmm11,-0x60(%r10)1834vmovdqa %xmm12,-0x50(%r10)1835vmovdqa %xmm13,-0x40(%r10)1836vmovdqa %xmm14,-0x30(%r10)1837vmovdqa %xmm15,-0x20(%r10)1838.Ldo_avx2_body$suffix:1839___1840$code.=<<___;1841lea .Lconst(%rip),%rcx1842lea 48+64($ctx),$ctx # size optimization1843vmovdqa 96(%rcx),$T0 # .Lpermd_avx218441845# expand and copy pre-calculated table to stack1846vmovdqu `16*0-64`($ctx),%x#$T21847and \$-512,%rsp1848vmovdqu `16*1-64`($ctx),%x#$T31849vmovdqu `16*2-64`($ctx),%x#$T41850vmovdqu `16*3-64`($ctx),%x#$D01851vmovdqu `16*4-64`($ctx),%x#$D11852vmovdqu `16*5-64`($ctx),%x#$D21853lea 0x90(%rsp),%rax # size optimization1854vmovdqu `16*6-64`($ctx),%x#$D31855vpermd $T2,$T0,$T2 # 00003412 -> 142434441856vmovdqu `16*7-64`($ctx),%x#$D41857vpermd $T3,$T0,$T31858vmovdqu `16*8-64`($ctx),%x#$MASK1859vpermd $T4,$T0,$T41860vmovdqa $T2,0x00(%rsp)1861vpermd $D0,$T0,$D01862vmovdqa $T3,0x20-0x90(%rax)1863vpermd $D1,$T0,$D11864vmovdqa $T4,0x40-0x90(%rax)1865vpermd $D2,$T0,$D21866vmovdqa $D0,0x60-0x90(%rax)1867vpermd $D3,$T0,$D31868vmovdqa $D1,0x80-0x90(%rax)1869vpermd $D4,$T0,$D41870vmovdqa $D2,0xa0-0x90(%rax)1871vpermd $MASK,$T0,$MASK1872vmovdqa $D3,0xc0-0x90(%rax)1873vmovdqa $D4,0xe0-0x90(%rax)1874vmovdqa $MASK,0x100-0x90(%rax)1875vmovdqa 64(%rcx),$MASK # .Lmask2618761877################################################################1878# load input1879vmovdqu 16*0($inp),%x#$T01880vmovdqu 16*1($inp),%x#$T11881vinserti128 \$1,16*2($inp),$T0,$T01882vinserti128 \$1,16*3($inp),$T1,$T11883lea 16*4($inp),$inp18841885vpsrldq \$6,$T0,$T2 # splat input1886vpsrldq \$6,$T1,$T31887vpunpckhqdq $T1,$T0,$T4 # 41888vpunpcklqdq $T3,$T2,$T2 # 2:31889vpunpcklqdq $T1,$T0,$T0 # 0:118901891vpsrlq \$30,$T2,$T31892vpsrlq \$4,$T2,$T21893vpsrlq \$26,$T0,$T11894vpsrlq \$40,$T4,$T4 # 41895vpand $MASK,$T2,$T2 # 21896vpand $MASK,$T0,$T0 # 01897vpand $MASK,$T1,$T1 # 11898vpand $MASK,$T3,$T3 # 31899vpor 32(%rcx),$T4,$T4 # padbit, yes, always19001901vpaddq $H2,$T2,$H2 # accumulate input1902sub \$64,$len1903jz .Ltail_avx2$suffix1904jmp .Loop_avx2$suffix19051906.align 321907.Loop_avx2$suffix:1908################################################################1909# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^41910# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^31911# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^21912# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^11913# 
\________/\__________/1914################################################################1915#vpaddq $H2,$T2,$H2 # accumulate input1916vpaddq $H0,$T0,$H01917vmovdqa `32*0`(%rsp),$T0 # r0^41918vpaddq $H1,$T1,$H11919vmovdqa `32*1`(%rsp),$T1 # r1^41920vpaddq $H3,$T3,$H31921vmovdqa `32*3`(%rsp),$T2 # r2^41922vpaddq $H4,$T4,$H41923vmovdqa `32*6-0x90`(%rax),$T3 # s3^41924vmovdqa `32*8-0x90`(%rax),$S4 # s4^419251926# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r41927# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r41928# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41929# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r41930# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r41931#1932# however, as h2 is "chronologically" first one available pull1933# corresponding operations up, so it's1934#1935# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r41936# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r41937# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r41938# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r31939# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r419401941vpmuludq $H2,$T0,$D2 # d2 = h2*r01942vpmuludq $H2,$T1,$D3 # d3 = h2*r11943vpmuludq $H2,$T2,$D4 # d4 = h2*r21944vpmuludq $H2,$T3,$D0 # d0 = h2*s31945vpmuludq $H2,$S4,$D1 # d1 = h2*s419461947vpmuludq $H0,$T1,$T4 # h0*r11948vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp1949vpaddq $T4,$D1,$D1 # d1 += h0*r11950vpaddq $H2,$D2,$D2 # d2 += h1*r11951vpmuludq $H3,$T1,$T4 # h3*r11952vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s11953vpaddq $T4,$D4,$D4 # d4 += h3*r11954vpaddq $H2,$D0,$D0 # d0 += h4*s11955vmovdqa `32*4-0x90`(%rax),$T1 # s219561957vpmuludq $H0,$T0,$T4 # h0*r01958vpmuludq $H1,$T0,$H2 # h1*r01959vpaddq $T4,$D0,$D0 # d0 += h0*r01960vpaddq $H2,$D1,$D1 # d1 += h1*r01961vpmuludq $H3,$T0,$T4 # h3*r01962vpmuludq $H4,$T0,$H2 # h4*r01963vmovdqu 16*0($inp),%x#$T0 # load input1964vpaddq $T4,$D3,$D3 # d3 += h3*r01965vpaddq $H2,$D4,$D4 # d4 += h4*r01966vinserti128 \$1,16*2($inp),$T0,$T019671968vpmuludq $H3,$T1,$T4 # h3*s21969vpmuludq $H4,$T1,$H2 # h4*s21970vmovdqu 16*1($inp),%x#$T11971vpaddq $T4,$D0,$D0 # d0 += h3*s21972vpaddq $H2,$D1,$D1 # d1 += h4*s21973vmovdqa `32*5-0x90`(%rax),$H2 # r31974vpmuludq $H1,$T2,$T4 # h1*r21975vpmuludq $H0,$T2,$T2 # h0*r21976vpaddq $T4,$D3,$D3 # d3 += h1*r21977vpaddq $T2,$D2,$D2 # d2 += h0*r21978vinserti128 \$1,16*3($inp),$T1,$T11979lea 16*4($inp),$inp19801981vpmuludq $H1,$H2,$T4 # h1*r31982vpmuludq $H0,$H2,$H2 # h0*r31983vpsrldq \$6,$T0,$T2 # splat input1984vpaddq $T4,$D4,$D4 # d4 += h1*r31985vpaddq $H2,$D3,$D3 # d3 += h0*r31986vpmuludq $H3,$T3,$T4 # h3*s31987vpmuludq $H4,$T3,$H2 # h4*s31988vpsrldq \$6,$T1,$T31989vpaddq $T4,$D1,$D1 # d1 += h3*s31990vpaddq $H2,$D2,$D2 # d2 += h4*s31991vpunpckhqdq $T1,$T0,$T4 # 419921993vpmuludq $H3,$S4,$H3 # h3*s41994vpmuludq $H4,$S4,$H4 # h4*s41995vpunpcklqdq $T1,$T0,$T0 # 0:11996vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r41997vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r41998vpunpcklqdq $T3,$T2,$T3 # 2:31999vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r42000vpmuludq $H1,$S4,$H0 # h1*s42001vmovdqa 64(%rcx),$MASK # .Lmask262002vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r42003vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s420042005################################################################2006# lazy reduction (interleaved with tail of input splat)20072008vpsrlq \$26,$H3,$D32009vpand $MASK,$H3,$H32010vpaddq $D3,$H4,$H4 # h3 -> h420112012vpsrlq \$26,$H0,$D02013vpand $MASK,$H0,$H02014vpaddq $D0,$D1,$H1 # h0 -> h120152016vpsrlq \$26,$H4,$D42017vpand $MASK,$H4,$H420182019vpsrlq \$4,$T3,$T220202021vpsrlq \$26,$H1,$D12022vpand $MASK,$H1,$H12023vpaddq 
$D1,$H2,$H2 # h1 -> h220242025vpaddq $D4,$H0,$H02026vpsllq \$2,$D4,$D42027vpaddq $D4,$H0,$H0 # h4 -> h020282029vpand $MASK,$T2,$T2 # 22030vpsrlq \$26,$T0,$T120312032vpsrlq \$26,$H2,$D22033vpand $MASK,$H2,$H22034vpaddq $D2,$H3,$H3 # h2 -> h320352036vpaddq $T2,$H2,$H2 # modulo-scheduled2037vpsrlq \$30,$T3,$T320382039vpsrlq \$26,$H0,$D02040vpand $MASK,$H0,$H02041vpaddq $D0,$H1,$H1 # h0 -> h120422043vpsrlq \$40,$T4,$T4 # 420442045vpsrlq \$26,$H3,$D32046vpand $MASK,$H3,$H32047vpaddq $D3,$H4,$H4 # h3 -> h420482049vpand $MASK,$T0,$T0 # 02050vpand $MASK,$T1,$T1 # 12051vpand $MASK,$T3,$T3 # 32052vpor 32(%rcx),$T4,$T4 # padbit, yes, always20532054sub \$64,$len2055jnz .Loop_avx2$suffix20562057.byte 0x66,0x902058.Ltail_avx2$suffix:2059################################################################2060# while above multiplications were by r^4 in all lanes, in last2061# iteration we multiply least significant lane by r^4 and most2062# significant one by r, so copy of above except that references2063# to the precomputed table are displaced by 4...20642065#vpaddq $H2,$T2,$H2 # accumulate input2066vpaddq $H0,$T0,$H02067vmovdqu `32*0+4`(%rsp),$T0 # r0^42068vpaddq $H1,$T1,$H12069vmovdqu `32*1+4`(%rsp),$T1 # r1^42070vpaddq $H3,$T3,$H32071vmovdqu `32*3+4`(%rsp),$T2 # r2^42072vpaddq $H4,$T4,$H42073vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^42074vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^420752076vpmuludq $H2,$T0,$D2 # d2 = h2*r02077vpmuludq $H2,$T1,$D3 # d3 = h2*r12078vpmuludq $H2,$T2,$D4 # d4 = h2*r22079vpmuludq $H2,$T3,$D0 # d0 = h2*s32080vpmuludq $H2,$S4,$D1 # d1 = h2*s420812082vpmuludq $H0,$T1,$T4 # h0*r12083vpmuludq $H1,$T1,$H2 # h1*r12084vpaddq $T4,$D1,$D1 # d1 += h0*r12085vpaddq $H2,$D2,$D2 # d2 += h1*r12086vpmuludq $H3,$T1,$T4 # h3*r12087vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s12088vpaddq $T4,$D4,$D4 # d4 += h3*r12089vpaddq $H2,$D0,$D0 # d0 += h4*s120902091vpmuludq $H0,$T0,$T4 # h0*r02092vpmuludq $H1,$T0,$H2 # h1*r02093vpaddq $T4,$D0,$D0 # d0 += h0*r02094vmovdqu `32*4+4-0x90`(%rax),$T1 # s22095vpaddq $H2,$D1,$D1 # d1 += h1*r02096vpmuludq $H3,$T0,$T4 # h3*r02097vpmuludq $H4,$T0,$H2 # h4*r02098vpaddq $T4,$D3,$D3 # d3 += h3*r02099vpaddq $H2,$D4,$D4 # d4 += h4*r021002101vpmuludq $H3,$T1,$T4 # h3*s22102vpmuludq $H4,$T1,$H2 # h4*s22103vpaddq $T4,$D0,$D0 # d0 += h3*s22104vpaddq $H2,$D1,$D1 # d1 += h4*s22105vmovdqu `32*5+4-0x90`(%rax),$H2 # r32106vpmuludq $H1,$T2,$T4 # h1*r22107vpmuludq $H0,$T2,$T2 # h0*r22108vpaddq $T4,$D3,$D3 # d3 += h1*r22109vpaddq $T2,$D2,$D2 # d2 += h0*r221102111vpmuludq $H1,$H2,$T4 # h1*r32112vpmuludq $H0,$H2,$H2 # h0*r32113vpaddq $T4,$D4,$D4 # d4 += h1*r32114vpaddq $H2,$D3,$D3 # d3 += h0*r32115vpmuludq $H3,$T3,$T4 # h3*s32116vpmuludq $H4,$T3,$H2 # h4*s32117vpaddq $T4,$D1,$D1 # d1 += h3*s32118vpaddq $H2,$D2,$D2 # d2 += h4*s321192120vpmuludq $H3,$S4,$H3 # h3*s42121vpmuludq $H4,$S4,$H4 # h4*s42122vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r42123vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r42124vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r42125vpmuludq $H1,$S4,$H0 # h1*s42126vmovdqa 64(%rcx),$MASK # .Lmask262127vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r42128vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s421292130################################################################2131# horizontal addition21322133vpsrldq \$8,$D1,$T12134vpsrldq \$8,$H2,$T22135vpsrldq \$8,$H3,$T32136vpsrldq \$8,$H4,$T42137vpsrldq \$8,$H0,$T02138vpaddq $T1,$D1,$D12139vpaddq $T2,$H2,$H22140vpaddq $T3,$H3,$H32141vpaddq $T4,$H4,$H42142vpaddq $T0,$H0,$H021432144vpermq \$0x2,$H3,$T32145vpermq \$0x2,$H4,$T42146vpermq \$0x2,$H0,$T02147vpermq \$0x2,$D1,$T12148vpermq 
\$0x2,$H2,$T22149vpaddq $T3,$H3,$H32150vpaddq $T4,$H4,$H42151vpaddq $T0,$H0,$H02152vpaddq $T1,$D1,$D12153vpaddq $T2,$H2,$H221542155################################################################2156# lazy reduction21572158vpsrlq \$26,$H3,$D32159vpand $MASK,$H3,$H32160vpaddq $D3,$H4,$H4 # h3 -> h421612162vpsrlq \$26,$H0,$D02163vpand $MASK,$H0,$H02164vpaddq $D0,$D1,$H1 # h0 -> h121652166vpsrlq \$26,$H4,$D42167vpand $MASK,$H4,$H421682169vpsrlq \$26,$H1,$D12170vpand $MASK,$H1,$H12171vpaddq $D1,$H2,$H2 # h1 -> h221722173vpaddq $D4,$H0,$H02174vpsllq \$2,$D4,$D42175vpaddq $D4,$H0,$H0 # h4 -> h021762177vpsrlq \$26,$H2,$D22178vpand $MASK,$H2,$H22179vpaddq $D2,$H3,$H3 # h2 -> h321802181vpsrlq \$26,$H0,$D02182vpand $MASK,$H0,$H02183vpaddq $D0,$H1,$H1 # h0 -> h121842185vpsrlq \$26,$H3,$D32186vpand $MASK,$H3,$H32187vpaddq $D3,$H4,$H4 # h3 -> h421882189vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced2190vmovd %x#$H1,`4*1-48-64`($ctx)2191vmovd %x#$H2,`4*2-48-64`($ctx)2192vmovd %x#$H3,`4*3-48-64`($ctx)2193vmovd %x#$H4,`4*4-48-64`($ctx)2194___2195$code.=<<___ if ($win64);2196vmovdqa -0xb0(%r10),%xmm62197vmovdqa -0xa0(%r10),%xmm72198vmovdqa -0x90(%r10),%xmm82199vmovdqa -0x80(%r10),%xmm92200vmovdqa -0x70(%r10),%xmm102201vmovdqa -0x60(%r10),%xmm112202vmovdqa -0x50(%r10),%xmm122203vmovdqa -0x40(%r10),%xmm132204vmovdqa -0x30(%r10),%xmm142205vmovdqa -0x20(%r10),%xmm152206lea -8(%r10),%rsp2207.Ldo_avx2_epilogue$suffix:2208___2209$code.=<<___ if (!$win64);2210lea -8(%r10),%rsp2211.cfi_def_cfa_register %rsp2212___2213$code.=<<___;2214vzeroupper2215RET2216.cfi_endproc2217___2218if($avx > 2 && $avx512) {2219my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));2220my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));2221my $PADBIT="%zmm30";22222223map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain2224map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));2225map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));2226map(s/%y/%z/,($MASK));22272228$code.=<<___;2229.cfi_startproc2230.Lblocks_avx512:2231mov \$15,%eax2232kmovw %eax,%k22233___2234$code.=<<___ if (!$win64);2235lea 8(%rsp),%r102236.cfi_def_cfa_register %r102237sub \$0x128,%rsp2238___2239$code.=<<___ if ($win64);2240lea 8(%rsp),%r102241sub \$0x1c8,%rsp2242vmovdqa %xmm6,-0xb0(%r10)2243vmovdqa %xmm7,-0xa0(%r10)2244vmovdqa %xmm8,-0x90(%r10)2245vmovdqa %xmm9,-0x80(%r10)2246vmovdqa %xmm10,-0x70(%r10)2247vmovdqa %xmm11,-0x60(%r10)2248vmovdqa %xmm12,-0x50(%r10)2249vmovdqa %xmm13,-0x40(%r10)2250vmovdqa %xmm14,-0x30(%r10)2251vmovdqa %xmm15,-0x20(%r10)2252.Ldo_avx512_body:2253___2254$code.=<<___;2255lea .Lconst(%rip),%rcx2256lea 48+64($ctx),$ctx # size optimization2257vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx222582259# expand pre-calculated table2260vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}2261and \$-512,%rsp2262vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}2263mov \$0x20,%rax2264vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}2265vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}2266vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}2267vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}2268vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}2269vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}2270vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4}2271vpermd $D0,$T2,$R0 # 00003412 -> 142434442272vpbroadcastq 64(%rcx),$MASK # .Lmask262273vpermd $D1,$T2,$R12274vpermd $T0,$T2,$S12275vpermd $D2,$T2,$R22276vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 02277vpsrlq \$32,$R0,$T0 # 14243444 -> 010203042278vpermd $T1,$T2,$S22279vmovdqu64 $R1,0x00(%rsp,%rax){%k2}2280vpsrlq \$32,$R1,$T12281vpermd $D3,$T2,$R32282vmovdqa64 $S1,0x40(%rsp){%k2}2283vpermd $T3,$T2,$S32284vpermd $D4,$T2,$R42285vmovdqu64 $R2,0x40(%rsp,%rax){%k2}2286vpermd $T4,$T2,$S42287vmovdqa64 $S2,0x80(%rsp){%k2}2288vmovdqu64 $R3,0x80(%rsp,%rax){%k2}2289vmovdqa64 $S3,0xc0(%rsp){%k2}2290vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}2291vmovdqa64 $S4,0x100(%rsp){%k2}22922293################################################################2294# calculate 5th through 8th powers of the key2295#2296# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r12297# d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r22298# d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r32299# d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r42300# d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r023012302vpmuludq $T0,$R0,$D0 # d0 = r0'*r02303vpmuludq $T0,$R1,$D1 # d1 = r0'*r12304vpmuludq $T0,$R2,$D2 # d2 = r0'*r22305vpmuludq $T0,$R3,$D3 # d3 = r0'*r32306vpmuludq $T0,$R4,$D4 # d4 = r0'*r42307vpsrlq \$32,$R2,$T223082309vpmuludq $T1,$S4,$M02310vpmuludq $T1,$R0,$M12311vpmuludq $T1,$R1,$M22312vpmuludq $T1,$R2,$M32313vpmuludq $T1,$R3,$M42314vpsrlq \$32,$R3,$T32315vpaddq $M0,$D0,$D0 # d0 += r1'*5*r42316vpaddq $M1,$D1,$D1 # d1 += r1'*r02317vpaddq $M2,$D2,$D2 # d2 += r1'*r12318vpaddq $M3,$D3,$D3 # d3 += r1'*r22319vpaddq $M4,$D4,$D4 # d4 += r1'*r323202321vpmuludq $T2,$S3,$M02322vpmuludq $T2,$S4,$M12323vpmuludq $T2,$R1,$M32324vpmuludq $T2,$R2,$M42325vpmuludq $T2,$R0,$M22326vpsrlq \$32,$R4,$T42327vpaddq $M0,$D0,$D0 # d0 += r2'*5*r32328vpaddq $M1,$D1,$D1 # d1 += r2'*5*r42329vpaddq $M3,$D3,$D3 # d3 += r2'*r12330vpaddq $M4,$D4,$D4 # d4 += r2'*r22331vpaddq $M2,$D2,$D2 # d2 += r2'*r023322333vpmuludq $T3,$S2,$M02334vpmuludq $T3,$R0,$M32335vpmuludq $T3,$R1,$M42336vpmuludq $T3,$S3,$M12337vpmuludq $T3,$S4,$M22338vpaddq $M0,$D0,$D0 # d0 += r3'*5*r22339vpaddq $M3,$D3,$D3 # d3 += r3'*r02340vpaddq $M4,$D4,$D4 # d4 += r3'*r12341vpaddq $M1,$D1,$D1 # d1 += r3'*5*r32342vpaddq $M2,$D2,$D2 # d2 += r3'*5*r423432344vpmuludq $T4,$S4,$M32345vpmuludq $T4,$R0,$M42346vpmuludq $T4,$S1,$M02347vpmuludq $T4,$S2,$M12348vpmuludq $T4,$S3,$M22349vpaddq $M3,$D3,$D3 # d3 += r2'*5*r42350vpaddq $M4,$D4,$D4 # d4 += r2'*r02351vpaddq $M0,$D0,$D0 # d0 += r2'*5*r12352vpaddq $M1,$D1,$D1 # d1 += r2'*5*r22353vpaddq $M2,$D2,$D2 # d2 += r2'*5*r323542355################################################################2356# load input2357vmovdqu64 16*0($inp),%z#$T32358vmovdqu64 16*4($inp),%z#$T42359lea 16*8($inp),$inp23602361################################################################2362# lazy reduction23632364vpsrlq \$26,$D3,$M32365vpandq $MASK,$D3,$D32366vpaddq $M3,$D4,$D4 # d3 -> d423672368vpsrlq \$26,$D0,$M02369vpandq $MASK,$D0,$D02370vpaddq $M0,$D1,$D1 # d0 -> d123712372vpsrlq \$26,$D4,$M42373vpandq $MASK,$D4,$D423742375vpsrlq \$26,$D1,$M12376vpandq $MASK,$D1,$D12377vpaddq $M1,$D2,$D2 # d1 -> d223782379vpaddq $M4,$D0,$D02380vpsllq \$2,$M4,$M42381vpaddq $M4,$D0,$D0 # d4 -> d023822383vpsrlq \$26,$D2,$M22384vpandq $MASK,$D2,$D22385vpaddq $M2,$D3,$D3 # d2 -> d323862387vpsrlq \$26,$D0,$M02388vpandq $MASK,$D0,$D02389vpaddq $M0,$D1,$D1 # d0 -> d123902391vpsrlq \$26,$D3,$M32392vpandq $MASK,$D3,$D32393vpaddq $M3,$D4,$D4 # d3 -> 
d423942395################################################################2396# at this point we have 14243444 in $R0-$S4 and 05060708 in2397# $D0-$D4, ...23982399vpunpcklqdq $T4,$T3,$T0 # transpose input2400vpunpckhqdq $T4,$T3,$T424012402# ... since input 64-bit lanes are ordered as 73625140, we could2403# "vperm" it to 76543210 (here and in each loop iteration), *or*2404# we could just flow along, hence the goal for $R0-$S4 is2405# 1858286838784888 ...24062407vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:2408mov \$0x7777,%eax2409kmovw %eax,%k124102411vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---2412vpermd $R1,$M0,$R12413vpermd $R2,$M0,$R22414vpermd $R3,$M0,$R32415vpermd $R4,$M0,$R424162417vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 18582868387848882418vpermd $D1,$M0,${R1}{%k1}2419vpermd $D2,$M0,${R2}{%k1}2420vpermd $D3,$M0,${R3}{%k1}2421vpermd $D4,$M0,${R4}{%k1}24222423vpslld \$2,$R1,$S1 # *52424vpslld \$2,$R2,$S22425vpslld \$2,$R3,$S32426vpslld \$2,$R4,$S42427vpaddd $R1,$S1,$S12428vpaddd $R2,$S2,$S22429vpaddd $R3,$S3,$S32430vpaddd $R4,$S4,$S424312432vpbroadcastq 32(%rcx),$PADBIT # .L12924332434vpsrlq \$52,$T0,$T2 # splat input2435vpsllq \$12,$T4,$T32436vporq $T3,$T2,$T22437vpsrlq \$26,$T0,$T12438vpsrlq \$14,$T4,$T32439vpsrlq \$40,$T4,$T4 # 42440vpandq $MASK,$T2,$T2 # 22441vpandq $MASK,$T0,$T0 # 02442#vpandq $MASK,$T1,$T1 # 12443#vpandq $MASK,$T3,$T3 # 32444#vporq $PADBIT,$T4,$T4 # padbit, yes, always24452446vpaddq $H2,$T2,$H2 # accumulate input2447sub \$192,$len2448jbe .Ltail_avx5122449jmp .Loop_avx51224502451.align 322452.Loop_avx512:2453################################################################2454# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^82455# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^72456# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^62457# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^52458# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^42459# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^32460# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^22461# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^12462# \________/\___________/2463################################################################2464#vpaddq $H2,$T2,$H2 # accumulate input24652466# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r42467# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r42468# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r42469# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r42470# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r42471#2472# however, as h2 is "chronologically" first one available pull2473# corresponding operations up, so it's2474#2475# d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r42476# d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r02477# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r12478# d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r22479# d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r324802481vpmuludq $H2,$R1,$D3 # d3 = h2*r12482vpaddq $H0,$T0,$H02483vpmuludq $H2,$R2,$D4 # d4 = h2*r22484vpandq $MASK,$T1,$T1 # 12485vpmuludq $H2,$S3,$D0 # d0 = h2*s32486vpandq $MASK,$T3,$T3 # 32487vpmuludq $H2,$S4,$D1 # d1 = h2*s42488vporq $PADBIT,$T4,$T4 # padbit, yes, always2489vpmuludq $H2,$R0,$D2 # d2 = h2*r02490vpaddq $H1,$T1,$H1 # accumulate input2491vpaddq $H3,$T3,$H32492vpaddq $H4,$T4,$H424932494vmovdqu64 16*0($inp),$T3 # load input2495vmovdqu64 16*4($inp),$T42496lea 16*8($inp),$inp2497vpmuludq $H0,$R3,$M32498vpmuludq $H0,$R4,$M42499vpmuludq $H0,$R0,$M02500vpmuludq $H0,$R1,$M12501vpaddq $M3,$D3,$D3 # d3 += h0*r32502vpaddq $M4,$D4,$D4 # d4 += h0*r42503vpaddq $M0,$D0,$D0 # d0 += h0*r02504vpaddq $M1,$D1,$D1 # d1 += h0*r125052506vpmuludq $H1,$R2,$M32507vpmuludq 
$H1,$R3,$M42508vpmuludq $H1,$S4,$M02509vpmuludq $H0,$R2,$M22510vpaddq $M3,$D3,$D3 # d3 += h1*r22511vpaddq $M4,$D4,$D4 # d4 += h1*r32512vpaddq $M0,$D0,$D0 # d0 += h1*s42513vpaddq $M2,$D2,$D2 # d2 += h0*r225142515vpunpcklqdq $T4,$T3,$T0 # transpose input2516vpunpckhqdq $T4,$T3,$T425172518vpmuludq $H3,$R0,$M32519vpmuludq $H3,$R1,$M42520vpmuludq $H1,$R0,$M12521vpmuludq $H1,$R1,$M22522vpaddq $M3,$D3,$D3 # d3 += h3*r02523vpaddq $M4,$D4,$D4 # d4 += h3*r12524vpaddq $M1,$D1,$D1 # d1 += h1*r02525vpaddq $M2,$D2,$D2 # d2 += h1*r125262527vpmuludq $H4,$S4,$M32528vpmuludq $H4,$R0,$M42529vpmuludq $H3,$S2,$M02530vpmuludq $H3,$S3,$M12531vpaddq $M3,$D3,$D3 # d3 += h4*s42532vpmuludq $H3,$S4,$M22533vpaddq $M4,$D4,$D4 # d4 += h4*r02534vpaddq $M0,$D0,$D0 # d0 += h3*s22535vpaddq $M1,$D1,$D1 # d1 += h3*s32536vpaddq $M2,$D2,$D2 # d2 += h3*s425372538vpmuludq $H4,$S1,$M02539vpmuludq $H4,$S2,$M12540vpmuludq $H4,$S3,$M22541vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s12542vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s22543vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s325442545################################################################2546# lazy reduction (interleaved with input splat)25472548vpsrlq \$52,$T0,$T2 # splat input2549vpsllq \$12,$T4,$T325502551vpsrlq \$26,$D3,$H32552vpandq $MASK,$D3,$D32553vpaddq $H3,$D4,$H4 # h3 -> h425542555vporq $T3,$T2,$T225562557vpsrlq \$26,$H0,$D02558vpandq $MASK,$H0,$H02559vpaddq $D0,$H1,$H1 # h0 -> h125602561vpandq $MASK,$T2,$T2 # 225622563vpsrlq \$26,$H4,$D42564vpandq $MASK,$H4,$H425652566vpsrlq \$26,$H1,$D12567vpandq $MASK,$H1,$H12568vpaddq $D1,$H2,$H2 # h1 -> h225692570vpaddq $D4,$H0,$H02571vpsllq \$2,$D4,$D42572vpaddq $D4,$H0,$H0 # h4 -> h025732574vpaddq $T2,$H2,$H2 # modulo-scheduled2575vpsrlq \$26,$T0,$T125762577vpsrlq \$26,$H2,$D22578vpandq $MASK,$H2,$H22579vpaddq $D2,$D3,$H3 # h2 -> h325802581vpsrlq \$14,$T4,$T325822583vpsrlq \$26,$H0,$D02584vpandq $MASK,$H0,$H02585vpaddq $D0,$H1,$H1 # h0 -> h125862587vpsrlq \$40,$T4,$T4 # 425882589vpsrlq \$26,$H3,$D32590vpandq $MASK,$H3,$H32591vpaddq $D3,$H4,$H4 # h3 -> h425922593vpandq $MASK,$T0,$T0 # 02594#vpandq $MASK,$T1,$T1 # 12595#vpandq $MASK,$T3,$T3 # 32596#vporq $PADBIT,$T4,$T4 # padbit, yes, always25972598sub \$128,$len2599ja .Loop_avx51226002601.Ltail_avx512:2602################################################################2603# while above multiplications were by r^8 in all lanes, in last2604# iteration we multiply least significant lane by r^8 and most2605# significant one by r, that's why table gets shifted...26062607vpsrlq \$32,$R0,$R0 # 01050206030704082608vpsrlq \$32,$R1,$R12609vpsrlq \$32,$R2,$R22610vpsrlq \$32,$S3,$S32611vpsrlq \$32,$S4,$S42612vpsrlq \$32,$R3,$R32613vpsrlq \$32,$R4,$R42614vpsrlq \$32,$S1,$S12615vpsrlq \$32,$S2,$S226162617################################################################2618# load either next or last 64 byte of input2619lea ($inp,$len),$inp26202621#vpaddq $H2,$T2,$H2 # accumulate input2622vpaddq $H0,$T0,$H026232624vpmuludq $H2,$R1,$D3 # d3 = h2*r12625vpmuludq $H2,$R2,$D4 # d4 = h2*r22626vpmuludq $H2,$S3,$D0 # d0 = h2*s32627vpandq $MASK,$T1,$T1 # 12628vpmuludq $H2,$S4,$D1 # d1 = h2*s42629vpandq $MASK,$T3,$T3 # 32630vpmuludq $H2,$R0,$D2 # d2 = h2*r02631vporq $PADBIT,$T4,$T4 # padbit, yes, always2632vpaddq $H1,$T1,$H1 # accumulate input2633vpaddq $H3,$T3,$H32634vpaddq $H4,$T4,$H426352636vmovdqu 16*0($inp),%x#$T02637vpmuludq $H0,$R3,$M32638vpmuludq $H0,$R4,$M42639vpmuludq $H0,$R0,$M02640vpmuludq $H0,$R1,$M12641vpaddq $M3,$D3,$D3 # d3 += h0*r32642vpaddq $M4,$D4,$D4 # d4 += h0*r42643vpaddq $M0,$D0,$D0 # d0 += 
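#
# The loop above is plain Horner evaluation split across eight independent
# lanes: every iteration multiplies each lane by r^8 and adds one new block,
# and the tail multiplies lane j by r^(8-j) (that is what the shifted power
# table is for), so the lane sums add up to the sequential Poly1305 result.
# A whole-number Python model of that schedule, for illustration only
# (limbs and vectorization omitted; names are made up):
#
#	def blocks_8x(h, msgs, r, p=(1 << 130) - 5):
#	    # msgs: padded block values (m + 2^128 for full blocks), len % 8 == 0
#	    r8 = pow(r, 8, p)
#	    acc = [(h + msgs[0]) if j == 0 else msgs[j] for j in range(8)]
#	    for i in range(8, len(msgs), 8):
#	        for j in range(8):
#	            acc[j] = (acc[j] * r8 + msgs[i + j]) % p
#	    # tail: lane j still owes a factor r^(8-j)
#	    return sum(acc[j] * pow(r, 8 - j, p) for j in range(8)) % p
#
# This returns the same value as the sequential h = (h + m)*r per block,
# which is why the weighted lanes can simply be summed at the end.
#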
h0*r02644vpaddq $M1,$D1,$D1 # d1 += h0*r126452646vmovdqu 16*1($inp),%x#$T12647vpmuludq $H1,$R2,$M32648vpmuludq $H1,$R3,$M42649vpmuludq $H1,$S4,$M02650vpmuludq $H0,$R2,$M22651vpaddq $M3,$D3,$D3 # d3 += h1*r22652vpaddq $M4,$D4,$D4 # d4 += h1*r32653vpaddq $M0,$D0,$D0 # d0 += h1*s42654vpaddq $M2,$D2,$D2 # d2 += h0*r226552656vinserti128 \$1,16*2($inp),%y#$T0,%y#$T02657vpmuludq $H3,$R0,$M32658vpmuludq $H3,$R1,$M42659vpmuludq $H1,$R0,$M12660vpmuludq $H1,$R1,$M22661vpaddq $M3,$D3,$D3 # d3 += h3*r02662vpaddq $M4,$D4,$D4 # d4 += h3*r12663vpaddq $M1,$D1,$D1 # d1 += h1*r02664vpaddq $M2,$D2,$D2 # d2 += h1*r126652666vinserti128 \$1,16*3($inp),%y#$T1,%y#$T12667vpmuludq $H4,$S4,$M32668vpmuludq $H4,$R0,$M42669vpmuludq $H3,$S2,$M02670vpmuludq $H3,$S3,$M12671vpmuludq $H3,$S4,$M22672vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s42673vpaddq $M4,$D4,$D4 # d4 += h4*r02674vpaddq $M0,$D0,$D0 # d0 += h3*s22675vpaddq $M1,$D1,$D1 # d1 += h3*s32676vpaddq $M2,$D2,$D2 # d2 += h3*s426772678vpmuludq $H4,$S1,$M02679vpmuludq $H4,$S2,$M12680vpmuludq $H4,$S3,$M22681vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s12682vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s22683vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s326842685################################################################2686# horizontal addition26872688mov \$1,%eax2689vpermq \$0xb1,$H3,$D32690vpermq \$0xb1,$D4,$H42691vpermq \$0xb1,$H0,$D02692vpermq \$0xb1,$H1,$D12693vpermq \$0xb1,$H2,$D22694vpaddq $D3,$H3,$H32695vpaddq $D4,$H4,$H42696vpaddq $D0,$H0,$H02697vpaddq $D1,$H1,$H12698vpaddq $D2,$H2,$H226992700kmovw %eax,%k32701vpermq \$0x2,$H3,$D32702vpermq \$0x2,$H4,$D42703vpermq \$0x2,$H0,$D02704vpermq \$0x2,$H1,$D12705vpermq \$0x2,$H2,$D22706vpaddq $D3,$H3,$H32707vpaddq $D4,$H4,$H42708vpaddq $D0,$H0,$H02709vpaddq $D1,$H1,$H12710vpaddq $D2,$H2,$H227112712vextracti64x4 \$0x1,$H3,%y#$D32713vextracti64x4 \$0x1,$H4,%y#$D42714vextracti64x4 \$0x1,$H0,%y#$D02715vextracti64x4 \$0x1,$H1,%y#$D12716vextracti64x4 \$0x1,$H2,%y#$D22717vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case2718vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx22719vpaddq $D0,$H0,${H0}{%k3}{z}2720vpaddq $D1,$H1,${H1}{%k3}{z}2721vpaddq $D2,$H2,${H2}{%k3}{z}2722___2723map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));2724map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));2725$code.=<<___;2726################################################################2727# lazy reduction (interleaved with input splat)27282729vpsrlq \$26,$H3,$D32730vpand $MASK,$H3,$H32731vpsrldq \$6,$T0,$T2 # splat input2732vpsrldq \$6,$T1,$T32733vpunpckhqdq $T1,$T0,$T4 # 42734vpaddq $D3,$H4,$H4 # h3 -> h427352736vpsrlq \$26,$H0,$D02737vpand $MASK,$H0,$H02738vpunpcklqdq $T3,$T2,$T2 # 2:32739vpunpcklqdq $T1,$T0,$T0 # 0:12740vpaddq $D0,$H1,$H1 # h0 -> h127412742vpsrlq \$26,$H4,$D42743vpand $MASK,$H4,$H427442745vpsrlq \$26,$H1,$D12746vpand $MASK,$H1,$H12747vpsrlq \$30,$T2,$T32748vpsrlq \$4,$T2,$T22749vpaddq $D1,$H2,$H2 # h1 -> h227502751vpaddq $D4,$H0,$H02752vpsllq \$2,$D4,$D42753vpsrlq \$26,$T0,$T12754vpsrlq \$40,$T4,$T4 # 42755vpaddq $D4,$H0,$H0 # h4 -> h027562757vpsrlq \$26,$H2,$D22758vpand $MASK,$H2,$H22759vpand $MASK,$T2,$T2 # 22760vpand $MASK,$T0,$T0 # 02761vpaddq $D2,$H3,$H3 # h2 -> h327622763vpsrlq \$26,$H0,$D02764vpand $MASK,$H0,$H02765vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx22766vpand $MASK,$T1,$T1 # 12767vpaddq $D0,$H1,$H1 # h0 -> h127682769vpsrlq \$26,$H3,$D32770vpand $MASK,$H3,$H32771vpand $MASK,$T3,$T3 # 32772vpor 32(%rcx),$T4,$T4 # padbit, yes, always2773vpaddq $D3,$H4,$H4 # h3 -> h427742775lea 0x90(%rsp),%rax # size optimization for 
.Ltail_avx2
	add	\$64,$len
	jnz	.Ltail_avx2$suffix

	vpsubq	$T2,$H2,$H2		# undo input accumulation
	vmovd	%x#$H0,`4*0-48-64`($ctx)# save partially reduced
	vmovd	%x#$H1,`4*1-48-64`($ctx)
	vmovd	%x#$H2,`4*2-48-64`($ctx)
	vmovd	%x#$H3,`4*3-48-64`($ctx)
	vmovd	%x#$H4,`4*4-48-64`($ctx)
	vzeroall
___
$code.=<<___	if ($win64);
	movdqa	-0xb0(%r10),%xmm6
	movdqa	-0xa0(%r10),%xmm7
	movdqa	-0x90(%r10),%xmm8
	movdqa	-0x80(%r10),%xmm9
	movdqa	-0x70(%r10),%xmm10
	movdqa	-0x60(%r10),%xmm11
	movdqa	-0x50(%r10),%xmm12
	movdqa	-0x40(%r10),%xmm13
	movdqa	-0x30(%r10),%xmm14
	movdqa	-0x20(%r10),%xmm15
	lea	-8(%r10),%rsp
.Ldo_avx512_epilogue:
___
$code.=<<___	if (!$win64);
	lea	-8(%r10),%rsp
.cfi_def_cfa_register	%rsp
___
$code.=<<___;
	RET
.cfi_endproc
___

}

}

&declare_function("poly1305_blocks_avx2", 32, 4);
poly1305_blocks_avxN(0);
&end_function("poly1305_blocks_avx2");

#######################################################################
if ($avx>2) {
# On entry we have input length divisible by 64. But since the inner loop
# processes 128 bytes per iteration, cases when the length is not divisible
# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
# for this tail, we wouldn't even have to allocate a stack frame...

&declare_function("poly1305_blocks_avx512", 32, 4);
poly1305_blocks_avxN(1);
&end_function("poly1305_blocks_avx512");

if (!$kernel && $avx>3) {
########################################################################
# VPMADD52 version using 2^44 radix.
#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize a couple of
# things. Base 2^52 doesn't provide an advantage over base 2^44 if you
# look at the amount of multiply-and-accumulate operations. Secondly, it
# makes it impossible to pre-compute multiples of 5 [referred to as
# s[]/sN in reference implementations], which means that more such
# operations would have to be performed in the inner loop, which in turn
# makes the critical path longer.
In other words, even though base 2^44 reduction might2842# look less elegant, overall critical path is actually shorter...28432844########################################################################2845# Layout of opaque area is following.2846#2847# unsigned __int64 h[3]; # current hash value base 2^442848# unsigned __int64 s[2]; # key value*20 base 2^442849# unsigned __int64 r[3]; # key value base 2^442850# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];2851# # r^n positions reflect2852# # placement in register, not2853# # memory, R[3] is R[1]*2028542855$code.=<<___;2856.type poly1305_init_base2_44,\@function,32857.align 322858poly1305_init_base2_44:2859xor %eax,%eax2860mov %rax,0($ctx) # initialize hash value2861mov %rax,8($ctx)2862mov %rax,16($ctx)28632864.Linit_base2_44:2865lea poly1305_blocks_vpmadd52(%rip),%r102866lea poly1305_emit_base2_44(%rip),%r1128672868mov \$0x0ffffffc0fffffff,%rax2869mov \$0x0ffffffc0ffffffc,%rcx2870and 0($inp),%rax2871mov \$0x00000fffffffffff,%r82872and 8($inp),%rcx2873mov \$0x00000fffffffffff,%r92874and %rax,%r82875shrd \$44,%rcx,%rax2876mov %r8,40($ctx) # r02877and %r9,%rax2878shr \$24,%rcx2879mov %rax,48($ctx) # r12880lea (%rax,%rax,4),%rax # *52881mov %rcx,56($ctx) # r22882shl \$2,%rax # magic <<22883lea (%rcx,%rcx,4),%rcx # *52884shl \$2,%rcx # magic <<22885mov %rax,24($ctx) # s12886mov %rcx,32($ctx) # s22887movq \$-1,64($ctx) # write impossible value2888___2889$code.=<<___ if ($flavour !~ /elf32/);2890mov %r10,0(%rdx)2891mov %r11,8(%rdx)2892___2893$code.=<<___ if ($flavour =~ /elf32/);2894mov %r10d,0(%rdx)2895mov %r11d,4(%rdx)2896___2897$code.=<<___;2898mov \$1,%eax2899RET2900.size poly1305_init_base2_44,.-poly1305_init_base2_442901___2902{2903my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));2904my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));2905my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));29062907$code.=<<___;2908.type poly1305_blocks_vpmadd52,\@function,42909.align 322910poly1305_blocks_vpmadd52:2911shr \$4,$len2912jz .Lno_data_vpmadd52 # too short29132914shl \$40,$padbit2915mov 64($ctx),%r8 # peek on power of the key29162917# if powers of the key are not calculated yet, process up to 32918# blocks with this single-block subroutine, otherwise ensure that2919# length is divisible by 2 blocks and pass the rest down to next2920# subroutine...29212922mov \$3,%rax2923mov \$1,%r102924cmp \$4,$len # is input long2925cmovae %r10,%rax2926test %r8,%r8 # is power value impossible?2927cmovns %r10,%rax29282929and $len,%rax # is input of favourable length?2930jz .Lblocks_vpmadd52_4x29312932sub %rax,$len2933mov \$7,%r10d2934mov \$1,%r11d2935kmovw %r10d,%k72936lea .L2_44_inp_permd(%rip),%r102937kmovw %r11d,%k129382939vmovq $padbit,%x#$PAD2940vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd2941vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift2942vpermq \$0xcf,$PAD,$PAD2943vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask29442945vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value2946vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys2947vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}2948vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}29492950vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt2951vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft29522953jmp .Loop_vpmadd5229542955.align 322956.Loop_vpmadd52:2957vmovdqu32 0($inp),%x#$T0 # load input as ----32102958lea 16($inp),$inp29592960vpermd $T0,$inp_permd,$T0 # ----3210 -> --3221102961vpsrlvq $inp_shift,$T0,$T02962vpandq $reduc_mask,$T0,$T02963vporq $PAD,$T0,$T029642965vpaddq 
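#
# poly1305_init_base2_44 above clamps the key halves with the standard
# Poly1305 masks and then re-splits r into 44+44+42-bit limbs; s1/s2 are
# stored as 20*r1 and 20*r2 because the cross terms they replace sit at
# 2^132 = 4*2^130 == 20 (mod 2^130-5). A Python sketch of that setup,
# for illustration only (function and variable names are made up):
#
#	def init_base2_44(key16):
#	    r_lo = int.from_bytes(key16[0:8],  'little') & 0x0ffffffc0fffffff
#	    r_hi = int.from_bytes(key16[8:16], 'little') & 0x0ffffffc0ffffffc
#	    mask44 = (1 << 44) - 1
#	    r0 = r_lo & mask44
#	    r1 = ((r_lo >> 44) | (r_hi << 20)) & mask44
#	    r2 = r_hi >> 24
#	    return r0, r1, r2, 20*r1, 20*r2           # r limbs plus s1, s2
#
# The hash value is kept in the same 44/44/42 split, so every limb (and the
# 20*r multiples) fits in the 52-bit inputs of VPMADD52 with headroom.
#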
$T0,$Dlo,$Dlo # accumulate input29662967vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value2968vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}2969vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}29702971vpxord $Dlo,$Dlo,$Dlo2972vpxord $Dhi,$Dhi,$Dhi29732974vpmadd52luq $r2r1r0,$H0,$Dlo2975vpmadd52huq $r2r1r0,$H0,$Dhi29762977vpmadd52luq $r1r0s2,$H1,$Dlo2978vpmadd52huq $r1r0s2,$H1,$Dhi29792980vpmadd52luq $r0s2s1,$H2,$Dlo2981vpmadd52huq $r0s2s1,$H2,$Dhi29822983vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword2984vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword2985vpandq $reduc_mask,$Dlo,$Dlo29862987vpaddq $T0,$Dhi,$Dhi29882989vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword29902991vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)29922993vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word2994vpandq $reduc_mask,$Dlo,$Dlo29952996vpermq \$0b10010011,$T0,$T029972998vpaddq $T0,$Dlo,$Dlo29993000vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}30013002vpaddq $T0,$Dlo,$Dlo3003vpsllq \$2,$T0,$T030043005vpaddq $T0,$Dlo,$Dlo30063007dec %rax # len-=163008jnz .Loop_vpmadd5230093010vmovdqu64 $Dlo,0($ctx){%k7} # store hash value30113012test $len,$len3013jnz .Lblocks_vpmadd52_4x30143015.Lno_data_vpmadd52:3016RET3017.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd523018___3019}3020{3021########################################################################3022# As implied by its name 4x subroutine processes 4 blocks in parallel3023# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power3024# and is handled in 256-bit %ymm registers.30253026my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));3027my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));3028my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));30293030$code.=<<___;3031.type poly1305_blocks_vpmadd52_4x,\@function,43032.align 323033poly1305_blocks_vpmadd52_4x:3034shr \$4,$len3035jz .Lno_data_vpmadd52_4x # too short30363037shl \$40,$padbit3038mov 64($ctx),%r8 # peek on power of the key30393040.Lblocks_vpmadd52_4x:3041vpbroadcastq $padbit,$PAD30423043vmovdqa64 .Lx_mask44(%rip),$mask443044mov \$5,%eax3045vmovdqa64 .Lx_mask42(%rip),$mask423046kmovw %eax,%k1 # used in 2x path30473048test %r8,%r8 # is power value impossible?3049js .Linit_vpmadd52 # if it is, then init R[4]30503051vmovq 0($ctx),%x#$H0 # load current hash value3052vmovq 8($ctx),%x#$H13053vmovq 16($ctx),%x#$H230543055test \$3,$len # is length 4*n+2?3056jnz .Lblocks_vpmadd52_2x_do30573058.Lblocks_vpmadd52_4x_do:3059vpbroadcastq 64($ctx),$R0 # load 4th power of the key3060vpbroadcastq 96($ctx),$R13061vpbroadcastq 128($ctx),$R23062vpbroadcastq 160($ctx),$S130633064.Lblocks_vpmadd52_4x_key_loaded:3065vpsllq \$2,$R2,$S2 # S2 = R2*5*43066vpaddq $R2,$S2,$S23067vpsllq \$2,$S2,$S230683069test \$7,$len # is len 8*n?3070jz .Lblocks_vpmadd52_8x30713072vmovdqu64 16*0($inp),$T2 # load data3073vmovdqu64 16*2($inp),$T33074lea 16*4($inp),$inp30753076vpunpcklqdq $T3,$T2,$T1 # transpose data3077vpunpckhqdq $T3,$T2,$T330783079# at this point 64-bit lanes are ordered as 3-1-2-030803081vpsrlq \$24,$T3,$T2 # splat the data3082vporq $PAD,$T2,$T23083vpaddq $T2,$H2,$H2 # accumulate input3084vpandq $mask44,$T1,$T03085vpsrlq \$44,$T1,$T13086vpsllq \$20,$T3,$T33087vporq $T3,$T1,$T13088vpandq $mask44,$T1,$T130893090sub \$4,$len3091jz .Ltail_vpmadd52_4x3092jmp .Loop_vpmadd52_4x3093ud230943095.align 323096.Linit_vpmadd52:3097vmovq 24($ctx),%x#$S1 # load key3098vmovq 56($ctx),%x#$H23099vmovq 32($ctx),%x#$S23100vmovq 40($ctx),%x#$R03101vmovq 48($ctx),%x#$R131023103vmovdqa $R0,$H03104vmovdqa 
$R1,$H13105vmovdqa $H2,$R231063107mov \$2,%eax31083109.Lmul_init_vpmadd52:3110vpxorq $D0lo,$D0lo,$D0lo3111vpmadd52luq $H2,$S1,$D0lo3112vpxorq $D0hi,$D0hi,$D0hi3113vpmadd52huq $H2,$S1,$D0hi3114vpxorq $D1lo,$D1lo,$D1lo3115vpmadd52luq $H2,$S2,$D1lo3116vpxorq $D1hi,$D1hi,$D1hi3117vpmadd52huq $H2,$S2,$D1hi3118vpxorq $D2lo,$D2lo,$D2lo3119vpmadd52luq $H2,$R0,$D2lo3120vpxorq $D2hi,$D2hi,$D2hi3121vpmadd52huq $H2,$R0,$D2hi31223123vpmadd52luq $H0,$R0,$D0lo3124vpmadd52huq $H0,$R0,$D0hi3125vpmadd52luq $H0,$R1,$D1lo3126vpmadd52huq $H0,$R1,$D1hi3127vpmadd52luq $H0,$R2,$D2lo3128vpmadd52huq $H0,$R2,$D2hi31293130vpmadd52luq $H1,$S2,$D0lo3131vpmadd52huq $H1,$S2,$D0hi3132vpmadd52luq $H1,$R0,$D1lo3133vpmadd52huq $H1,$R0,$D1hi3134vpmadd52luq $H1,$R1,$D2lo3135vpmadd52huq $H1,$R1,$D2hi31363137################################################################3138# partial reduction3139vpsrlq \$44,$D0lo,$tmp3140vpsllq \$8,$D0hi,$D0hi3141vpandq $mask44,$D0lo,$H03142vpaddq $tmp,$D0hi,$D0hi31433144vpaddq $D0hi,$D1lo,$D1lo31453146vpsrlq \$44,$D1lo,$tmp3147vpsllq \$8,$D1hi,$D1hi3148vpandq $mask44,$D1lo,$H13149vpaddq $tmp,$D1hi,$D1hi31503151vpaddq $D1hi,$D2lo,$D2lo31523153vpsrlq \$42,$D2lo,$tmp3154vpsllq \$10,$D2hi,$D2hi3155vpandq $mask42,$D2lo,$H23156vpaddq $tmp,$D2hi,$D2hi31573158vpaddq $D2hi,$H0,$H03159vpsllq \$2,$D2hi,$D2hi31603161vpaddq $D2hi,$H0,$H031623163vpsrlq \$44,$H0,$tmp # additional step3164vpandq $mask44,$H0,$H031653166vpaddq $tmp,$H1,$H131673168dec %eax3169jz .Ldone_init_vpmadd5231703171vpunpcklqdq $R1,$H1,$R1 # 1,23172vpbroadcastq %x#$H1,%x#$H1 # 2,23173vpunpcklqdq $R2,$H2,$R23174vpbroadcastq %x#$H2,%x#$H23175vpunpcklqdq $R0,$H0,$R03176vpbroadcastq %x#$H0,%x#$H031773178vpsllq \$2,$R1,$S1 # S1 = R1*5*43179vpsllq \$2,$R2,$S2 # S2 = R2*5*43180vpaddq $R1,$S1,$S13181vpaddq $R2,$S2,$S23182vpsllq \$2,$S1,$S13183vpsllq \$2,$S2,$S231843185jmp .Lmul_init_vpmadd523186ud231873188.align 323189.Ldone_init_vpmadd52:3190vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,43191vinserti128 \$1,%x#$R2,$H2,$R23192vinserti128 \$1,%x#$R0,$H0,$R031933194vpermq \$0b11011000,$R1,$R1 # 1,3,2,43195vpermq \$0b11011000,$R2,$R23196vpermq \$0b11011000,$R0,$R031973198vpsllq \$2,$R1,$S1 # S1 = R1*5*43199vpaddq $R1,$S1,$S13200vpsllq \$2,$S1,$S132013202vmovq 0($ctx),%x#$H0 # load current hash value3203vmovq 8($ctx),%x#$H13204vmovq 16($ctx),%x#$H232053206test \$3,$len # is length 4*n+2?3207jnz .Ldone_init_vpmadd52_2x32083209vmovdqu64 $R0,64($ctx) # save key powers3210vpbroadcastq %x#$R0,$R0 # broadcast 4th power3211vmovdqu64 $R1,96($ctx)3212vpbroadcastq %x#$R1,$R13213vmovdqu64 $R2,128($ctx)3214vpbroadcastq %x#$R2,$R23215vmovdqu64 $S1,160($ctx)3216vpbroadcastq %x#$S1,$S132173218jmp .Lblocks_vpmadd52_4x_key_loaded3219ud232203221.align 323222.Ldone_init_vpmadd52_2x:3223vmovdqu64 $R0,64($ctx) # save key powers3224vpsrldq \$8,$R0,$R0 # 0-1-0-23225vmovdqu64 $R1,96($ctx)3226vpsrldq \$8,$R1,$R13227vmovdqu64 $R2,128($ctx)3228vpsrldq \$8,$R2,$R23229vmovdqu64 $S1,160($ctx)3230vpsrldq \$8,$S1,$S13231jmp .Lblocks_vpmadd52_2x_key_loaded3232ud232333234.align 323235.Lblocks_vpmadd52_2x_do:3236vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers3237vmovdqu64 160+8($ctx),${S1}{%k1}{z}3238vmovdqu64 64+8($ctx),${R0}{%k1}{z}3239vmovdqu64 96+8($ctx),${R1}{%k1}{z}32403241.Lblocks_vpmadd52_2x_key_loaded:3242vmovdqu64 16*0($inp),$T2 # load data3243vpxorq $T3,$T3,$T33244lea 16*2($inp),$inp32453246vpunpcklqdq $T3,$T2,$T1 # transpose data3247vpunpckhqdq $T3,$T2,$T332483249# at this point 64-bit lanes are ordered as x-1-x-032503251vpsrlq \$24,$T3,$T2 # splat the 
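#
# Each VPMADD52 pass above implements the 3-limb base 2^44 product followed
# by the partial reduction with 44/44/42-bit masks; the carry that leaves
# the 42-bit top limb is folded into the bottom limb times 5 (c + (c<<2)),
# and one extra 44-bit carry (the "additional step") keeps the limbs
# bounded. A scalar Python sketch of one such pass, for illustration only
# (names are made up; r0,r1,r2,s1,s2 are as in the init sketch earlier):
#
#	def mul_reduce_44(h0, h1, h2, r0, r1, r2, s1, s2):
#	    d0 = h0*r0 + h1*s2 + h2*s1
#	    d1 = h0*r1 + h1*r0 + h2*s2
#	    d2 = h0*r2 + h1*r1 + h2*r0
#	    mask44, mask42 = (1 << 44) - 1, (1 << 42) - 1
#	    c = d0 >> 44; h0 = d0 & mask44; d1 += c
#	    c = d1 >> 44; h1 = d1 & mask44; d2 += c
#	    c = d2 >> 42; h2 = d2 & mask42
#	    h0 += c + (c << 2)                       # 2^130 == 5 (mod p)
#	    c = h0 >> 44; h0 &= mask44; h1 += c      # the additional step
#	    return h0, h1, h2
#
# In the vector code each d is accumulated as separate low/high 52-bit
# halves; the high halves carry an implicit factor 2^52, which is why they
# are shifted left by 8 (= 52-44) before joining the next limb's carry.
#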
data3252vporq $PAD,$T2,$T23253vpaddq $T2,$H2,$H2 # accumulate input3254vpandq $mask44,$T1,$T03255vpsrlq \$44,$T1,$T13256vpsllq \$20,$T3,$T33257vporq $T3,$T1,$T13258vpandq $mask44,$T1,$T132593260jmp .Ltail_vpmadd52_2x3261ud232623263.align 323264.Loop_vpmadd52_4x:3265#vpaddq $T2,$H2,$H2 # accumulate input3266vpaddq $T0,$H0,$H03267vpaddq $T1,$H1,$H132683269vpxorq $D0lo,$D0lo,$D0lo3270vpmadd52luq $H2,$S1,$D0lo3271vpxorq $D0hi,$D0hi,$D0hi3272vpmadd52huq $H2,$S1,$D0hi3273vpxorq $D1lo,$D1lo,$D1lo3274vpmadd52luq $H2,$S2,$D1lo3275vpxorq $D1hi,$D1hi,$D1hi3276vpmadd52huq $H2,$S2,$D1hi3277vpxorq $D2lo,$D2lo,$D2lo3278vpmadd52luq $H2,$R0,$D2lo3279vpxorq $D2hi,$D2hi,$D2hi3280vpmadd52huq $H2,$R0,$D2hi32813282vmovdqu64 16*0($inp),$T2 # load data3283vmovdqu64 16*2($inp),$T33284lea 16*4($inp),$inp3285vpmadd52luq $H0,$R0,$D0lo3286vpmadd52huq $H0,$R0,$D0hi3287vpmadd52luq $H0,$R1,$D1lo3288vpmadd52huq $H0,$R1,$D1hi3289vpmadd52luq $H0,$R2,$D2lo3290vpmadd52huq $H0,$R2,$D2hi32913292vpunpcklqdq $T3,$T2,$T1 # transpose data3293vpunpckhqdq $T3,$T2,$T33294vpmadd52luq $H1,$S2,$D0lo3295vpmadd52huq $H1,$S2,$D0hi3296vpmadd52luq $H1,$R0,$D1lo3297vpmadd52huq $H1,$R0,$D1hi3298vpmadd52luq $H1,$R1,$D2lo3299vpmadd52huq $H1,$R1,$D2hi33003301################################################################3302# partial reduction (interleaved with data splat)3303vpsrlq \$44,$D0lo,$tmp3304vpsllq \$8,$D0hi,$D0hi3305vpandq $mask44,$D0lo,$H03306vpaddq $tmp,$D0hi,$D0hi33073308vpsrlq \$24,$T3,$T23309vporq $PAD,$T2,$T23310vpaddq $D0hi,$D1lo,$D1lo33113312vpsrlq \$44,$D1lo,$tmp3313vpsllq \$8,$D1hi,$D1hi3314vpandq $mask44,$D1lo,$H13315vpaddq $tmp,$D1hi,$D1hi33163317vpandq $mask44,$T1,$T03318vpsrlq \$44,$T1,$T13319vpsllq \$20,$T3,$T33320vpaddq $D1hi,$D2lo,$D2lo33213322vpsrlq \$42,$D2lo,$tmp3323vpsllq \$10,$D2hi,$D2hi3324vpandq $mask42,$D2lo,$H23325vpaddq $tmp,$D2hi,$D2hi33263327vpaddq $T2,$H2,$H2 # accumulate input3328vpaddq $D2hi,$H0,$H03329vpsllq \$2,$D2hi,$D2hi33303331vpaddq $D2hi,$H0,$H03332vporq $T3,$T1,$T13333vpandq $mask44,$T1,$T133343335vpsrlq \$44,$H0,$tmp # additional step3336vpandq $mask44,$H0,$H033373338vpaddq $tmp,$H1,$H133393340sub \$4,$len # len-=643341jnz .Loop_vpmadd52_4x33423343.Ltail_vpmadd52_4x:3344vmovdqu64 128($ctx),$R2 # load all key powers3345vmovdqu64 160($ctx),$S13346vmovdqu64 64($ctx),$R03347vmovdqu64 96($ctx),$R133483349.Ltail_vpmadd52_2x:3350vpsllq \$2,$R2,$S2 # S2 = R2*5*43351vpaddq $R2,$S2,$S23352vpsllq \$2,$S2,$S233533354#vpaddq $T2,$H2,$H2 # accumulate input3355vpaddq $T0,$H0,$H03356vpaddq $T1,$H1,$H133573358vpxorq $D0lo,$D0lo,$D0lo3359vpmadd52luq $H2,$S1,$D0lo3360vpxorq $D0hi,$D0hi,$D0hi3361vpmadd52huq $H2,$S1,$D0hi3362vpxorq $D1lo,$D1lo,$D1lo3363vpmadd52luq $H2,$S2,$D1lo3364vpxorq $D1hi,$D1hi,$D1hi3365vpmadd52huq $H2,$S2,$D1hi3366vpxorq $D2lo,$D2lo,$D2lo3367vpmadd52luq $H2,$R0,$D2lo3368vpxorq $D2hi,$D2hi,$D2hi3369vpmadd52huq $H2,$R0,$D2hi33703371vpmadd52luq $H0,$R0,$D0lo3372vpmadd52huq $H0,$R0,$D0hi3373vpmadd52luq $H0,$R1,$D1lo3374vpmadd52huq $H0,$R1,$D1hi3375vpmadd52luq $H0,$R2,$D2lo3376vpmadd52huq $H0,$R2,$D2hi33773378vpmadd52luq $H1,$S2,$D0lo3379vpmadd52huq $H1,$S2,$D0hi3380vpmadd52luq $H1,$R0,$D1lo3381vpmadd52huq $H1,$R0,$D1hi3382vpmadd52luq $H1,$R1,$D2lo3383vpmadd52huq $H1,$R1,$D2hi33843385################################################################3386# horizontal addition33873388mov \$1,%eax3389kmovw %eax,%k13390vpsrldq \$8,$D0lo,$T03391vpsrldq \$8,$D0hi,$H03392vpsrldq \$8,$D1lo,$T13393vpsrldq \$8,$D1hi,$H13394vpaddq $T0,$D0lo,$D0lo3395vpaddq $H0,$D0hi,$D0hi3396vpsrldq \$8,$D2lo,$T23397vpsrldq 
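#
# The data "splat" interleaved above converts each 16-byte block into the
# same 44/44/42 split as the key: the low 44 bits, 44 middle bits taken
# across the two input qwords, and the top 40 bits plus the pad bit
# (padbit was shifted left by 40 on entry, since 2^128 = 2^88 * 2^40).
# A Python sketch, for illustration only (names are made up):
#
#	def split_block_44(block16, padbit=1):
#	    lo = int.from_bytes(block16[0:8],  'little')
#	    hi = int.from_bytes(block16[8:16], 'little')
#	    mask44 = (1 << 44) - 1
#	    m0 = lo & mask44
#	    m1 = ((lo >> 44) | (hi << 20)) & mask44
#	    m2 = (hi >> 24) | (padbit << 40)
#	    return m0, m1, m2
#
# These three values are what gets added into h0/h1/h2 before the next
# multiply by the key powers.
#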
\$8,$D2hi,$H23398vpaddq $T1,$D1lo,$D1lo3399vpaddq $H1,$D1hi,$D1hi3400vpermq \$0x2,$D0lo,$T03401vpermq \$0x2,$D0hi,$H03402vpaddq $T2,$D2lo,$D2lo3403vpaddq $H2,$D2hi,$D2hi34043405vpermq \$0x2,$D1lo,$T13406vpermq \$0x2,$D1hi,$H13407vpaddq $T0,$D0lo,${D0lo}{%k1}{z}3408vpaddq $H0,$D0hi,${D0hi}{%k1}{z}3409vpermq \$0x2,$D2lo,$T23410vpermq \$0x2,$D2hi,$H23411vpaddq $T1,$D1lo,${D1lo}{%k1}{z}3412vpaddq $H1,$D1hi,${D1hi}{%k1}{z}3413vpaddq $T2,$D2lo,${D2lo}{%k1}{z}3414vpaddq $H2,$D2hi,${D2hi}{%k1}{z}34153416################################################################3417# partial reduction3418vpsrlq \$44,$D0lo,$tmp3419vpsllq \$8,$D0hi,$D0hi3420vpandq $mask44,$D0lo,$H03421vpaddq $tmp,$D0hi,$D0hi34223423vpaddq $D0hi,$D1lo,$D1lo34243425vpsrlq \$44,$D1lo,$tmp3426vpsllq \$8,$D1hi,$D1hi3427vpandq $mask44,$D1lo,$H13428vpaddq $tmp,$D1hi,$D1hi34293430vpaddq $D1hi,$D2lo,$D2lo34313432vpsrlq \$42,$D2lo,$tmp3433vpsllq \$10,$D2hi,$D2hi3434vpandq $mask42,$D2lo,$H23435vpaddq $tmp,$D2hi,$D2hi34363437vpaddq $D2hi,$H0,$H03438vpsllq \$2,$D2hi,$D2hi34393440vpaddq $D2hi,$H0,$H034413442vpsrlq \$44,$H0,$tmp # additional step3443vpandq $mask44,$H0,$H034443445vpaddq $tmp,$H1,$H13446# at this point $len is3447# either 4*n+2 or 0...3448sub \$2,$len # len-=323449ja .Lblocks_vpmadd52_4x_do34503451vmovq %x#$H0,0($ctx)3452vmovq %x#$H1,8($ctx)3453vmovq %x#$H2,16($ctx)3454vzeroall34553456.Lno_data_vpmadd52_4x:3457RET3458.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x3459___3460}3461{3462########################################################################3463# As implied by its name 8x subroutine processes 8 blocks in parallel...3464# This is intermediate version, as it's used only in cases when input3465# length is either 8*n, 8*n+1 or 8*n+2...34663467my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));3468my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));3469my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));3470my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));34713472$code.=<<___;3473.type poly1305_blocks_vpmadd52_8x,\@function,43474.align 323475poly1305_blocks_vpmadd52_8x:3476shr \$4,$len3477jz .Lno_data_vpmadd52_8x # too short34783479shl \$40,$padbit3480mov 64($ctx),%r8 # peek on power of the key34813482vmovdqa64 .Lx_mask44(%rip),$mask443483vmovdqa64 .Lx_mask42(%rip),$mask4234843485test %r8,%r8 # is power value impossible?3486js .Linit_vpmadd52 # if it is, then init R[4]34873488vmovq 0($ctx),%x#$H0 # load current hash value3489vmovq 8($ctx),%x#$H13490vmovq 16($ctx),%x#$H234913492.Lblocks_vpmadd52_8x:3493################################################################3494# fist we calculate more key powers34953496vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers3497vmovdqu64 160($ctx),$S13498vmovdqu64 64($ctx),$R03499vmovdqu64 96($ctx),$R135003501vpsllq \$2,$R2,$S2 # S2 = R2*5*43502vpaddq $R2,$S2,$S23503vpsllq \$2,$S2,$S235043505vpbroadcastq %x#$R2,$RR2 # broadcast 4th power3506vpbroadcastq %x#$R0,$RR03507vpbroadcastq %x#$R1,$RR135083509vpxorq $D0lo,$D0lo,$D0lo3510vpmadd52luq $RR2,$S1,$D0lo3511vpxorq $D0hi,$D0hi,$D0hi3512vpmadd52huq $RR2,$S1,$D0hi3513vpxorq $D1lo,$D1lo,$D1lo3514vpmadd52luq $RR2,$S2,$D1lo3515vpxorq $D1hi,$D1hi,$D1hi3516vpmadd52huq $RR2,$S2,$D1hi3517vpxorq $D2lo,$D2lo,$D2lo3518vpmadd52luq $RR2,$R0,$D2lo3519vpxorq $D2hi,$D2hi,$D2hi3520vpmadd52huq $RR2,$R0,$D2hi35213522vpmadd52luq $RR0,$R0,$D0lo3523vpmadd52huq $RR0,$R0,$D0hi3524vpmadd52luq $RR0,$R1,$D1lo3525vpmadd52huq $RR0,$R1,$D1hi3526vpmadd52luq $RR0,$R2,$D2lo3527vpmadd52huq 
$RR0,$R2,$D2hi35283529vpmadd52luq $RR1,$S2,$D0lo3530vpmadd52huq $RR1,$S2,$D0hi3531vpmadd52luq $RR1,$R0,$D1lo3532vpmadd52huq $RR1,$R0,$D1hi3533vpmadd52luq $RR1,$R1,$D2lo3534vpmadd52huq $RR1,$R1,$D2hi35353536################################################################3537# partial reduction3538vpsrlq \$44,$D0lo,$tmp3539vpsllq \$8,$D0hi,$D0hi3540vpandq $mask44,$D0lo,$RR03541vpaddq $tmp,$D0hi,$D0hi35423543vpaddq $D0hi,$D1lo,$D1lo35443545vpsrlq \$44,$D1lo,$tmp3546vpsllq \$8,$D1hi,$D1hi3547vpandq $mask44,$D1lo,$RR13548vpaddq $tmp,$D1hi,$D1hi35493550vpaddq $D1hi,$D2lo,$D2lo35513552vpsrlq \$42,$D2lo,$tmp3553vpsllq \$10,$D2hi,$D2hi3554vpandq $mask42,$D2lo,$RR23555vpaddq $tmp,$D2hi,$D2hi35563557vpaddq $D2hi,$RR0,$RR03558vpsllq \$2,$D2hi,$D2hi35593560vpaddq $D2hi,$RR0,$RR035613562vpsrlq \$44,$RR0,$tmp # additional step3563vpandq $mask44,$RR0,$RR035643565vpaddq $tmp,$RR1,$RR135663567################################################################3568# At this point Rx holds 1324 powers, RRx - 5768, and the goal3569# is 15263748, which reflects how data is loaded...35703571vpunpcklqdq $R2,$RR2,$T2 # 37483572vpunpckhqdq $R2,$RR2,$R2 # 15263573vpunpcklqdq $R0,$RR0,$T03574vpunpckhqdq $R0,$RR0,$R03575vpunpcklqdq $R1,$RR1,$T13576vpunpckhqdq $R1,$RR1,$R13577___3578######## switch to %zmm3579map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);3580map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);3581map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);3582map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);35833584$code.=<<___;3585vshufi64x2 \$0x44,$R2,$T2,$RR2 # 152637483586vshufi64x2 \$0x44,$R0,$T0,$RR03587vshufi64x2 \$0x44,$R1,$T1,$RR135883589vmovdqu64 16*0($inp),$T2 # load data3590vmovdqu64 16*4($inp),$T33591lea 16*8($inp),$inp35923593vpsllq \$2,$RR2,$SS2 # S2 = R2*5*43594vpsllq \$2,$RR1,$SS1 # S1 = R1*5*43595vpaddq $RR2,$SS2,$SS23596vpaddq $RR1,$SS1,$SS13597vpsllq \$2,$SS2,$SS23598vpsllq \$2,$SS1,$SS135993600vpbroadcastq $padbit,$PAD3601vpbroadcastq %x#$mask44,$mask443602vpbroadcastq %x#$mask42,$mask4236033604vpbroadcastq %x#$SS1,$S1 # broadcast 8th power3605vpbroadcastq %x#$SS2,$S23606vpbroadcastq %x#$RR0,$R03607vpbroadcastq %x#$RR1,$R13608vpbroadcastq %x#$RR2,$R236093610vpunpcklqdq $T3,$T2,$T1 # transpose data3611vpunpckhqdq $T3,$T2,$T336123613# at this point 64-bit lanes are ordered as 7362514036143615vpsrlq \$24,$T3,$T2 # splat the data3616vporq $PAD,$T2,$T23617vpaddq $T2,$H2,$H2 # accumulate input3618vpandq $mask44,$T1,$T03619vpsrlq \$44,$T1,$T13620vpsllq \$20,$T3,$T33621vporq $T3,$T1,$T13622vpandq $mask44,$T1,$T136233624sub \$8,$len3625jz .Ltail_vpmadd52_8x3626jmp .Loop_vpmadd52_8x36273628.align 323629.Loop_vpmadd52_8x:3630#vpaddq $T2,$H2,$H2 # accumulate input3631vpaddq $T0,$H0,$H03632vpaddq $T1,$H1,$H136333634vpxorq $D0lo,$D0lo,$D0lo3635vpmadd52luq $H2,$S1,$D0lo3636vpxorq $D0hi,$D0hi,$D0hi3637vpmadd52huq $H2,$S1,$D0hi3638vpxorq $D1lo,$D1lo,$D1lo3639vpmadd52luq $H2,$S2,$D1lo3640vpxorq $D1hi,$D1hi,$D1hi3641vpmadd52huq $H2,$S2,$D1hi3642vpxorq $D2lo,$D2lo,$D2lo3643vpmadd52luq $H2,$R0,$D2lo3644vpxorq $D2hi,$D2hi,$D2hi3645vpmadd52huq $H2,$R0,$D2hi36463647vmovdqu64 16*0($inp),$T2 # load data3648vmovdqu64 16*4($inp),$T33649lea 16*8($inp),$inp3650vpmadd52luq $H0,$R0,$D0lo3651vpmadd52huq $H0,$R0,$D0hi3652vpmadd52luq $H0,$R1,$D1lo3653vpmadd52huq $H0,$R1,$D1hi3654vpmadd52luq $H0,$R2,$D2lo3655vpmadd52huq $H0,$R2,$D2hi36563657vpunpcklqdq $T3,$T2,$T1 # transpose data3658vpunpckhqdq $T3,$T2,$T33659vpmadd52luq $H1,$S2,$D0lo3660vpmadd52huq $H1,$S2,$D0hi3661vpmadd52luq $H1,$R0,$D1lo3662vpmadd52huq 
$H1,$R0,$D1hi3663vpmadd52luq $H1,$R1,$D2lo3664vpmadd52huq $H1,$R1,$D2hi36653666################################################################3667# partial reduction (interleaved with data splat)3668vpsrlq \$44,$D0lo,$tmp3669vpsllq \$8,$D0hi,$D0hi3670vpandq $mask44,$D0lo,$H03671vpaddq $tmp,$D0hi,$D0hi36723673vpsrlq \$24,$T3,$T23674vporq $PAD,$T2,$T23675vpaddq $D0hi,$D1lo,$D1lo36763677vpsrlq \$44,$D1lo,$tmp3678vpsllq \$8,$D1hi,$D1hi3679vpandq $mask44,$D1lo,$H13680vpaddq $tmp,$D1hi,$D1hi36813682vpandq $mask44,$T1,$T03683vpsrlq \$44,$T1,$T13684vpsllq \$20,$T3,$T33685vpaddq $D1hi,$D2lo,$D2lo36863687vpsrlq \$42,$D2lo,$tmp3688vpsllq \$10,$D2hi,$D2hi3689vpandq $mask42,$D2lo,$H23690vpaddq $tmp,$D2hi,$D2hi36913692vpaddq $T2,$H2,$H2 # accumulate input3693vpaddq $D2hi,$H0,$H03694vpsllq \$2,$D2hi,$D2hi36953696vpaddq $D2hi,$H0,$H03697vporq $T3,$T1,$T13698vpandq $mask44,$T1,$T136993700vpsrlq \$44,$H0,$tmp # additional step3701vpandq $mask44,$H0,$H037023703vpaddq $tmp,$H1,$H137043705sub \$8,$len # len-=1283706jnz .Loop_vpmadd52_8x37073708.Ltail_vpmadd52_8x:3709#vpaddq $T2,$H2,$H2 # accumulate input3710vpaddq $T0,$H0,$H03711vpaddq $T1,$H1,$H137123713vpxorq $D0lo,$D0lo,$D0lo3714vpmadd52luq $H2,$SS1,$D0lo3715vpxorq $D0hi,$D0hi,$D0hi3716vpmadd52huq $H2,$SS1,$D0hi3717vpxorq $D1lo,$D1lo,$D1lo3718vpmadd52luq $H2,$SS2,$D1lo3719vpxorq $D1hi,$D1hi,$D1hi3720vpmadd52huq $H2,$SS2,$D1hi3721vpxorq $D2lo,$D2lo,$D2lo3722vpmadd52luq $H2,$RR0,$D2lo3723vpxorq $D2hi,$D2hi,$D2hi3724vpmadd52huq $H2,$RR0,$D2hi37253726vpmadd52luq $H0,$RR0,$D0lo3727vpmadd52huq $H0,$RR0,$D0hi3728vpmadd52luq $H0,$RR1,$D1lo3729vpmadd52huq $H0,$RR1,$D1hi3730vpmadd52luq $H0,$RR2,$D2lo3731vpmadd52huq $H0,$RR2,$D2hi37323733vpmadd52luq $H1,$SS2,$D0lo3734vpmadd52huq $H1,$SS2,$D0hi3735vpmadd52luq $H1,$RR0,$D1lo3736vpmadd52huq $H1,$RR0,$D1hi3737vpmadd52luq $H1,$RR1,$D2lo3738vpmadd52huq $H1,$RR1,$D2hi37393740################################################################3741# horizontal addition37423743mov \$1,%eax3744kmovw %eax,%k13745vpsrldq \$8,$D0lo,$T03746vpsrldq \$8,$D0hi,$H03747vpsrldq \$8,$D1lo,$T13748vpsrldq \$8,$D1hi,$H13749vpaddq $T0,$D0lo,$D0lo3750vpaddq $H0,$D0hi,$D0hi3751vpsrldq \$8,$D2lo,$T23752vpsrldq \$8,$D2hi,$H23753vpaddq $T1,$D1lo,$D1lo3754vpaddq $H1,$D1hi,$D1hi3755vpermq \$0x2,$D0lo,$T03756vpermq \$0x2,$D0hi,$H03757vpaddq $T2,$D2lo,$D2lo3758vpaddq $H2,$D2hi,$D2hi37593760vpermq \$0x2,$D1lo,$T13761vpermq \$0x2,$D1hi,$H13762vpaddq $T0,$D0lo,$D0lo3763vpaddq $H0,$D0hi,$D0hi3764vpermq \$0x2,$D2lo,$T23765vpermq \$0x2,$D2hi,$H23766vpaddq $T1,$D1lo,$D1lo3767vpaddq $H1,$D1hi,$D1hi3768vextracti64x4 \$1,$D0lo,%y#$T03769vextracti64x4 \$1,$D0hi,%y#$H03770vpaddq $T2,$D2lo,$D2lo3771vpaddq $H2,$D2hi,$D2hi37723773vextracti64x4 \$1,$D1lo,%y#$T13774vextracti64x4 \$1,$D1hi,%y#$H13775vextracti64x4 \$1,$D2lo,%y#$T23776vextracti64x4 \$1,$D2hi,%y#$H23777___3778######## switch back to %ymm3779map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);3780map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);3781map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);37823783$code.=<<___;3784vpaddq $T0,$D0lo,${D0lo}{%k1}{z}3785vpaddq $H0,$D0hi,${D0hi}{%k1}{z}3786vpaddq $T1,$D1lo,${D1lo}{%k1}{z}3787vpaddq $H1,$D1hi,${D1hi}{%k1}{z}3788vpaddq $T2,$D2lo,${D2lo}{%k1}{z}3789vpaddq $H2,$D2hi,${D2hi}{%k1}{z}37903791################################################################3792# partial reduction3793vpsrlq \$44,$D0lo,$tmp3794vpsllq \$8,$D0hi,$D0hi3795vpandq $mask44,$D0lo,$H03796vpaddq $tmp,$D0hi,$D0hi37973798vpaddq $D0hi,$D1lo,$D1lo37993800vpsrlq \$44,$D1lo,$tmp3801vpsllq 
\$8,$D1hi,$D1hi3802vpandq $mask44,$D1lo,$H13803vpaddq $tmp,$D1hi,$D1hi38043805vpaddq $D1hi,$D2lo,$D2lo38063807vpsrlq \$42,$D2lo,$tmp3808vpsllq \$10,$D2hi,$D2hi3809vpandq $mask42,$D2lo,$H23810vpaddq $tmp,$D2hi,$D2hi38113812vpaddq $D2hi,$H0,$H03813vpsllq \$2,$D2hi,$D2hi38143815vpaddq $D2hi,$H0,$H038163817vpsrlq \$44,$H0,$tmp # additional step3818vpandq $mask44,$H0,$H038193820vpaddq $tmp,$H1,$H138213822################################################################38233824vmovq %x#$H0,0($ctx)3825vmovq %x#$H1,8($ctx)3826vmovq %x#$H2,16($ctx)3827vzeroall38283829.Lno_data_vpmadd52_8x:3830RET3831.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x3832___3833}3834$code.=<<___;3835.type poly1305_emit_base2_44,\@function,33836.align 323837poly1305_emit_base2_44:3838mov 0($ctx),%r8 # load hash value3839mov 8($ctx),%r93840mov 16($ctx),%r1038413842mov %r9,%rax3843shr \$20,%r93844shl \$44,%rax3845mov %r10,%rcx3846shr \$40,%r103847shl \$24,%rcx38483849add %rax,%r83850adc %rcx,%r93851adc \$0,%r1038523853mov %r8,%rax3854add \$5,%r8 # compare to modulus3855mov %r9,%rcx3856adc \$0,%r93857adc \$0,%r103858shr \$2,%r10 # did 130-bit value overflow?3859cmovnz %r8,%rax3860cmovnz %r9,%rcx38613862add 0($nonce),%rax # accumulate nonce3863adc 8($nonce),%rcx3864mov %rax,0($mac) # write result3865mov %rcx,8($mac)38663867RET3868.size poly1305_emit_base2_44,.-poly1305_emit_base2_443869___3870} } }3871}38723873if (!$kernel)3874{ # chacha20-poly1305 helpers3875my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order3876("%rdi","%rsi","%rdx","%rcx"); # Unix order3877$code.=<<___;3878.globl xor128_encrypt_n_pad3879.type xor128_encrypt_n_pad,\@abi-omnipotent3880.align 163881xor128_encrypt_n_pad:3882sub $otp,$inp3883sub $otp,$out3884mov $len,%r10 # put len aside3885shr \$4,$len # len / 163886jz .Ltail_enc3887nop3888.Loop_enc_xmm:3889movdqu ($inp,$otp),%xmm03890pxor ($otp),%xmm03891movdqu %xmm0,($out,$otp)3892movdqa %xmm0,($otp)3893lea 16($otp),$otp3894dec $len3895jnz .Loop_enc_xmm38963897and \$15,%r10 # len % 163898jz .Ldone_enc38993900.Ltail_enc:3901mov \$16,$len3902sub %r10,$len3903xor %eax,%eax3904.Loop_enc_byte:3905mov ($inp,$otp),%al3906xor ($otp),%al3907mov %al,($out,$otp)3908mov %al,($otp)3909lea 1($otp),$otp3910dec %r103911jnz .Loop_enc_byte39123913xor %eax,%eax3914.Loop_enc_pad:3915mov %al,($otp)3916lea 1($otp),$otp3917dec $len3918jnz .Loop_enc_pad39193920.Ldone_enc:3921mov $otp,%rax3922RET3923.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad39243925.globl xor128_decrypt_n_pad3926.type xor128_decrypt_n_pad,\@abi-omnipotent3927.align 163928xor128_decrypt_n_pad:3929sub $otp,$inp3930sub $otp,$out3931mov $len,%r10 # put len aside3932shr \$4,$len # len / 163933jz .Ltail_dec3934nop3935.Loop_dec_xmm:3936movdqu ($inp,$otp),%xmm03937movdqa ($otp),%xmm13938pxor %xmm0,%xmm13939movdqu %xmm1,($out,$otp)3940movdqa %xmm0,($otp)3941lea 16($otp),$otp3942dec $len3943jnz .Loop_dec_xmm39443945pxor %xmm1,%xmm13946and \$15,%r10 # len % 163947jz .Ldone_dec39483949.Ltail_dec:3950mov \$16,$len3951sub %r10,$len3952xor %eax,%eax3953xor %r11d,%r11d3954.Loop_dec_byte:3955mov ($inp,$otp),%r11b3956mov ($otp),%al3957xor %r11b,%al3958mov %al,($out,$otp)3959mov %r11b,($otp)3960lea 1($otp),$otp3961dec %r103962jnz .Loop_dec_byte39633964xor %eax,%eax3965.Loop_dec_pad:3966mov %al,($otp)3967lea 1($otp),$otp3968dec $len3969jnz .Loop_dec_pad39703971.Ldone_dec:3972mov $otp,%rax3973RET3974.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad3975___3976}39773978# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 
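#
# poly1305_emit_base2_44 above recombines the 44/44/42-bit limbs into a
# 130-bit value, performs the final conditional subtraction of p = 2^130-5
# by checking whether h+5 reaches 2^130, and then adds the second key half
# (called nonce in this file's calling convention) modulo 2^128. A Python
# sketch of the same logic, for illustration only (names are made up):
#
#	def emit_base2_44(h0, h1, h2, s):
#	    h = h0 + (h1 << 44) + (h2 << 88)
#	    g = h + 5
#	    if g >> 130:                 # h >= 2^130 - 5, so take h - p
#	        h = g
#	    tag = (h + s) & ((1 << 128) - 1)
#	    return tag.to_bytes(16, 'little')
#
# The assembly does the same with two 64-bit words and carries: the shifted
# h1/h2 pieces are merged with add/adc, and cmovnz selects the +5 variant
# only when the carry into bit 130 is set.
#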
frame,3979# CONTEXT *context,DISPATCHER_CONTEXT *disp)3980if ($win64) {3981$rec="%rcx";3982$frame="%rdx";3983$context="%r8";3984$disp="%r9";39853986$code.=<<___;3987.extern __imp_RtlVirtualUnwind3988.type se_handler,\@abi-omnipotent3989.align 163990se_handler:3991push %rsi3992push %rdi3993push %rbx3994push %rbp3995push %r123996push %r133997push %r143998push %r153999pushfq4000sub \$64,%rsp40014002mov 120($context),%rax # pull context->Rax4003mov 248($context),%rbx # pull context->Rip40044005mov 8($disp),%rsi # disp->ImageBase4006mov 56($disp),%r11 # disp->HandlerData40074008mov 0(%r11),%r10d # HandlerData[0]4009lea (%rsi,%r10),%r10 # prologue label4010cmp %r10,%rbx # context->Rip<.Lprologue4011jb .Lcommon_seh_tail40124013mov 152($context),%rax # pull context->Rsp40144015mov 4(%r11),%r10d # HandlerData[1]4016lea (%rsi,%r10),%r10 # epilogue label4017cmp %r10,%rbx # context->Rip>=.Lepilogue4018jae .Lcommon_seh_tail40194020lea 48(%rax),%rax40214022mov -8(%rax),%rbx4023mov -16(%rax),%rbp4024mov -24(%rax),%r124025mov -32(%rax),%r134026mov -40(%rax),%r144027mov -48(%rax),%r154028mov %rbx,144($context) # restore context->Rbx4029mov %rbp,160($context) # restore context->Rbp4030mov %r12,216($context) # restore context->R124031mov %r13,224($context) # restore context->R134032mov %r14,232($context) # restore context->R144033mov %r15,240($context) # restore context->R1440344035jmp .Lcommon_seh_tail4036.size se_handler,.-se_handler40374038.type avx_handler,\@abi-omnipotent4039.align 164040avx_handler:4041push %rsi4042push %rdi4043push %rbx4044push %rbp4045push %r124046push %r134047push %r144048push %r154049pushfq4050sub \$64,%rsp40514052mov 120($context),%rax # pull context->Rax4053mov 248($context),%rbx # pull context->Rip40544055mov 8($disp),%rsi # disp->ImageBase4056mov 56($disp),%r11 # disp->HandlerData40574058mov 0(%r11),%r10d # HandlerData[0]4059lea (%rsi,%r10),%r10 # prologue label4060cmp %r10,%rbx # context->Rip<prologue label4061jb .Lcommon_seh_tail40624063mov 152($context),%rax # pull context->Rsp40644065mov 4(%r11),%r10d # HandlerData[1]4066lea (%rsi,%r10),%r10 # epilogue label4067cmp %r10,%rbx # context->Rip>=epilogue label4068jae .Lcommon_seh_tail40694070mov 208($context),%rax # pull context->R1140714072lea 0x50(%rax),%rsi4073lea 0xf8(%rax),%rax4074lea 512($context),%rdi # &context.Xmm64075mov \$20,%ecx4076.long 0xa548f3fc # cld; rep movsq40774078.Lcommon_seh_tail:4079mov 8(%rax),%rdi4080mov 16(%rax),%rsi4081mov %rax,152($context) # restore context->Rsp4082mov %rsi,168($context) # restore context->Rsi4083mov %rdi,176($context) # restore context->Rdi40844085mov 40($disp),%rdi # disp->ContextRecord4086mov $context,%rsi # context4087mov \$154,%ecx # sizeof(CONTEXT)4088.long 0xa548f3fc # cld; rep movsq40894090mov $disp,%rsi4091xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER4092mov 8(%rsi),%rdx # arg2, disp->ImageBase4093mov 0(%rsi),%r8 # arg3, disp->ControlPc4094mov 16(%rsi),%r9 # arg4, disp->FunctionEntry4095mov 40(%rsi),%r10 # disp->ContextRecord4096lea 56(%rsi),%r11 # &disp->HandlerData4097lea 24(%rsi),%r12 # &disp->EstablisherFrame4098mov %r10,32(%rsp) # arg54099mov %r11,40(%rsp) # arg64100mov %r12,48(%rsp) # arg74101mov %rcx,56(%rsp) # arg8, (NULL)4102call *__imp_RtlVirtualUnwind(%rip)41034104mov \$1,%eax # ExceptionContinueSearch4105add \$64,%rsp4106popfq4107pop %r154108pop %r144109pop %r134110pop %r124111pop %rbp4112pop %rbx4113pop %rdi4114pop %rsi4115RET4116.size avx_handler,.-avx_handler41174118.section .pdata4119.align 44120.rva .LSEH_begin_poly1305_block_init_arch4121.rva 
.LSEH_end_poly1305_block_init_arch4122.rva .LSEH_info_poly1305_block_init_arch41234124.rva .LSEH_begin_poly1305_blocks_x86_644125.rva .LSEH_end_poly1305_blocks_x86_644126.rva .LSEH_info_poly1305_blocks_x86_6441274128.rva .LSEH_begin_poly1305_emit_x86_644129.rva .LSEH_end_poly1305_emit_x86_644130.rva .LSEH_info_poly1305_emit_x86_644131___4132$code.=<<___ if ($avx);4133.rva .LSEH_begin_poly1305_blocks_avx4134.rva .Lbase2_64_avx4135.rva .LSEH_info_poly1305_blocks_avx_141364137.rva .Lbase2_64_avx4138.rva .Leven_avx4139.rva .LSEH_info_poly1305_blocks_avx_241404141.rva .Leven_avx4142.rva .LSEH_end_poly1305_blocks_avx4143.rva .LSEH_info_poly1305_blocks_avx_341444145.rva .LSEH_begin_poly1305_emit_avx4146.rva .LSEH_end_poly1305_emit_avx4147.rva .LSEH_info_poly1305_emit_avx4148___4149$code.=<<___ if ($avx>1);4150.rva .LSEH_begin_poly1305_blocks_avx24151.rva .Lbase2_64_avx24152.rva .LSEH_info_poly1305_blocks_avx2_141534154.rva .Lbase2_64_avx24155.rva .Leven_avx24156.rva .LSEH_info_poly1305_blocks_avx2_241574158.rva .Leven_avx24159.rva .LSEH_end_poly1305_blocks_avx24160.rva .LSEH_info_poly1305_blocks_avx2_34161___4162$code.=<<___ if ($avx>2);4163.rva .LSEH_begin_poly1305_blocks_avx5124164.rva .LSEH_end_poly1305_blocks_avx5124165.rva .LSEH_info_poly1305_blocks_avx5124166___4167$code.=<<___;4168.section .xdata4169.align 84170.LSEH_info_poly1305_block_init_arch:4171.byte 9,0,0,04172.rva se_handler4173.rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch41744175.LSEH_info_poly1305_blocks_x86_64:4176.byte 9,0,0,04177.rva se_handler4178.rva .Lblocks_body,.Lblocks_epilogue41794180.LSEH_info_poly1305_emit_x86_64:4181.byte 9,0,0,04182.rva se_handler4183.rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_644184___4185$code.=<<___ if ($avx);4186.LSEH_info_poly1305_blocks_avx_1:4187.byte 9,0,0,04188.rva se_handler4189.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]41904191.LSEH_info_poly1305_blocks_avx_2:4192.byte 9,0,0,04193.rva se_handler4194.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]41954196.LSEH_info_poly1305_blocks_avx_3:4197.byte 9,0,0,04198.rva avx_handler4199.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]42004201.LSEH_info_poly1305_emit_avx:4202.byte 9,0,0,04203.rva se_handler4204.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx4205___4206$code.=<<___ if ($avx>1);4207.LSEH_info_poly1305_blocks_avx2_1:4208.byte 9,0,0,04209.rva se_handler4210.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]42114212.LSEH_info_poly1305_blocks_avx2_2:4213.byte 9,0,0,04214.rva se_handler4215.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]42164217.LSEH_info_poly1305_blocks_avx2_3:4218.byte 9,0,0,04219.rva avx_handler4220.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]4221___4222$code.=<<___ if ($avx>2);4223.LSEH_info_poly1305_blocks_avx512:4224.byte 9,0,0,04225.rva avx_handler4226.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]4227___4228}42294230open SELF,$0;4231while(<SELF>) {4232next if (/^#!/);4233last if (!s/^#/\/\// and !/^$/);4234print;4235}4236close SELF;42374238foreach (split('\n',$code)) {4239s/\`([^\`]*)\`/eval($1)/ge;4240s/%r([a-z]+)#d/%e$1/g;4241s/%r([0-9]+)#d/%r$1d/g;4242s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;42434244if ($kernel) {4245s/(^\.type.*),[0-9]+$/\1/;4246s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;4247next if /^\.cfi.*/;4248}42494250print $_,"\n";4251}4252close STDOUT;425342544255
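#
# The filter loop above massages the generated source line by line:
# backquoted expressions are evaluated (so 16*1-64 becomes -48), and the
# #-suffixed register spellings are resolved to a concrete width, e.g.
# "%rax#d" becomes "%eax", "%r10#d" becomes "%r10d" and "%x#%ymm14"
# becomes "%xmm14". A rough Python equivalent of the register rewrites,
# for illustration only (not part of the build):
#
#	import re
#	def fixup(line):
#	    line = re.sub(r'%r([a-z]+)#d', r'%e\1', line)     # %rax#d  -> %eax
#	    line = re.sub(r'%r([0-9]+)#d', r'%r\1d', line)    # %r10#d  -> %r10d
#	    line = re.sub(r'%x#%[yz]', '%x', line)            # %x#%ymm -> %xmm
#	    line = re.sub(r'%y#%z',    '%y', line)            # %y#%zmm -> %ymm
#	    line = re.sub(r'%z#%[yz]', '%z', line)            # %z#%... -> %zmm
#	    return line
#
# (The Perl version chains the width rewrites with "or", so only the first
# matching form is applied to any given line.)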