Path: blob/master/lib/crypto/x86/chacha-avx512vl-x86_64.S
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11

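	# For reference, each half of .Ldoubleround below is one ChaCha
	# quarter-round applied to the four row registers (a,b,c,d) =
	# (x0,x1,x2,x3), in C-like pseudocode (rol32 = 32-bit left rotate):
	#	a += b;  d ^= a;  d = rol32(d, 16);
	#	c += d;  b ^= c;  b = rol32(b, 12);
	#	a += b;  d ^= a;  d = rol32(d, 8);
	#	c += d;  b ^= c;  b = rol32(b, 7);
	# The vpshufd shuffles in between rotate the rows so that the second
	# half operates on the diagonals of the state matrix.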
.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3

	sub	$2,%r8d
	jnz	.Ldoubleround

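	# Each ymm register holds block 0's state in its low 128-bit lane
	# and block 1's (counter + 1, via CTR2BL) in its high lane. Below,
	# the saved initial state is added back to form the keystream; the
	# low lanes are xored against the first 64 bytes of input and the
	# high lanes are extracted for the second block. Every 16-byte chunk
	# is gated by a length check, so a partial tail branches to
	# .Lxorpart2.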
	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	cmp	$0x10,%rcx
	jl	.Lxorpart2
	vpxord	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	cmp	$0x20,%rcx
	jl	.Lxorpart2
	vpxord	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	cmp	$0x30,%rcx
	jl	.Lxorpart2
	vpxord	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	cmp	$0x40,%rcx
	jl	.Lxorpart2
	vpxord	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	cmp	$0x50,%rcx
	jl	.Lxorpart2
	vpxord	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	cmp	$0x60,%rcx
	jl	.Lxorpart2
	vpxord	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	cmp	$0x70,%rcx
	jl	.Lxorpart2
	vpxord	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	cmp	$0x80,%rcx
	jl	.Lxorpart2
	vpxord	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

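	# The tail handler below builds a byte mask with the low (len & 0xf)
	# bits set: %rax = (1 << %cl) - 1, computed with shld/sub. Loaded
	# into %k1, it drives masked vmovdqu8 loads and stores so that only
	# the remaining bytes of the 16-byte keystream chunk held in %xmm7
	# are read and written.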
.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone2
	mov	%rax,%r9
	and	$~0xf,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. The
	# required word shuffling has a rather high latency, so we can do
	# the arithmetic on two matrix-pairs without much slowdown.

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15

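	# Registers %ymm0-3 hold blocks 0/1 (counters +0/+1 via CTR2BL) and
	# %ymm4-7 hold blocks 2/3 (counters +2/+3 via CTR4BL). The round
	# below interleaves the instructions for the two pairs so that the
	# shuffle and rotate latency of one pair is hidden by the arithmetic
	# on the other.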
.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	cmp	$0x10,%rcx
	jl	.Lxorpart4
	vpxord	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	cmp	$0x20,%rcx
	jl	.Lxorpart4
	vpxord	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	cmp	$0x30,%rcx
	jl	.Lxorpart4
	vpxord	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	cmp	$0x40,%rcx
	jl	.Lxorpart4
	vpxord	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	cmp	$0x50,%rcx
	jl	.Lxorpart4
	vpxord	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	cmp	$0x60,%rcx
	jl	.Lxorpart4
	vpxord	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	cmp	$0x70,%rcx
	jl	.Lxorpart4
	vpxord	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	cmp	$0x80,%rcx
	jl	.Lxorpart4
	vpxord	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	cmp	$0x90,%rcx
	jl	.Lxorpart4
	vpxord	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	cmp	$0xa0,%rcx
	jl	.Lxorpart4
	vpxord	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	cmp	$0xb0,%rcx
	jl	.Lxorpart4
	vpxord	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	cmp	$0xc0,%rcx
	jl	.Lxorpart4
	vpxord	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	cmp	$0xd0,%rcx
	jl	.Lxorpart4
	vpxord	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	cmp	$0xe0,%rcx
	jl	.Lxorpart4
	vpxord	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	cmp	$0xf0,%rcx
	jl	.Lxorpart4
	vpxord	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	cmp	$0x100,%rcx
	jl	.Lxorpart4
	vpxord	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone4
	mov	%rax,%r9
	and	$~0xf,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.

	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd	CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4

	sub	$2,%r8d
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15

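	# Each register now holds one state word, sliced across all eight
	# blocks. The unpack/permute sequence below transposes them into
	# contiguous 32-byte chunks of keystream so they can be xored
	# against the input stream in address order, with a length check
	# before every chunk so a partial tail branches to .Lxorpart8.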
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp	$0x0020,%rcx
	jl	.Lxorpart8
	vpxord	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rcx
	jl	.Lxorpart8
	vpxord	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp	$0x0060,%rcx
	jl	.Lxorpart8
	vpxord	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rcx
	jl	.Lxorpart8
	vpxord	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rcx
	jl	.Lxorpart8
	vpxord	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp	$0x00e0,%rcx
	jl	.Lxorpart8
	vpxord	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rcx
	jl	.Lxorpart8
	vpxord	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp	$0x0120,%rcx
	jl	.Lxorpart8
	vpxord	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp	$0x0140,%rcx
	jl	.Lxorpart8
	vpxord	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp	$0x0160,%rcx
	jl	.Lxorpart8
	vpxord	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp	$0x0180,%rcx
	jl	.Lxorpart8
	vpxord	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp	$0x01a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp	$0x01c0,%rcx
	jl	.Lxorpart8
	vpxord	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp	$0x01e0,%rcx
	jl	.Lxorpart8
	vpxord	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp	$0x0200,%rcx
	jl	.Lxorpart8
	vpxord	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0x1f,%rcx
	jz	.Ldone8
	mov	%rax,%r9
	and	$~0x1f,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord	%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp	.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)