/* Path: blob/main/sys/crypto/openssl/aarch64/chacha-armv8.S */
/* 39536 views */
/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
//
// AArch64 (GNU as, C-preprocessed) implementation of the ChaCha20 stream
// cipher.  Comment character is `//`; `arm_arch.h` supplies the
// AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER macros and the
// ARMV7_NEON / ARMV8_SVE capability bits used below.
//
// Register roles shared by every entry point in this file (as used by the
// code below):
//   x0 = output pointer   (written with stp/st1/strb, advanced as data is produced)
//   x1 = input pointer    (read with ldp/ld1/ldrb, advanced as data is consumed)
//   x2 = length in bytes
//   x3 = key pointer      (32 bytes are read)
//   x4 = counter pointer  (16 bytes are read)
//
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden	OPENSSL_armcap_P


#endif

.section	.rodata

.align	5
// First 16 bytes of the state: ASCII "expand 32-byte k" stored as two
// little-endian 64-bit words.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane counter offsets: loaded into v8 by ChaCha20_neon and added to the
// broadcast counter word so the four NEON lanes work on four different blocks.
.Lone:
.long	1,2,3,4
// Byte-shuffle indices for `tbl`; the NEON code uses this permutation where
// the scalar code uses `ror #24`.
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm", NUL-terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

.text

//-----------------------------------------------------------------------
// ChaCha20_ctr32_dflt
//
// Default (non-SVE) entry.  Inputs of 192 bytes or more are handed to the
// NEON code when OPENSSL_armcap_P reports ARMV7_NEON (user-space builds
// only); everything else runs the scalar code at .Lshort, one 64-byte block
// per .Loop_outer iteration.
//
// In:    x0 = out, x1 = in, x2 = len, x3 = key, x4 = counter
// Pre:   x2 != 0.  There is no zero-length check here; with x2 == 0 the
//        byte loop at .Loop_tail would not terminate.  ChaCha20_ctr32
//        performs that check before reaching this code.
// Stack: 96-byte frame (x29/x30, x19-x28) plus 64 bytes of scratch that
//        holds the key-stream block for a partial final block; the scratch
//        is zeroed before returning.
// Note:  x30 is reused as a data register (counter words 2-3) and is
//        reloaded from the frame in the epilogue.
//
// Labels .Lshort, .Labort, .Lcheck_neon and .Less_than_64 are also branch
// targets for ChaCha20_ctr32 / ChaCha20_neon.
//-----------------------------------------------------------------------
.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	cmp	x2,#192
	b.lo	.Lshort
#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	// x22-x28,x30 keep the 16-word input block, two 32-bit words per register.
	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __AARCH64EB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	// Working state: one 32-bit word per register in w5-w17,w19-w21.
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations of .Loop
	// The flags set here survive the whole round loop (only add/eor/ror/
	// cbnz follow) and are consumed by `b.lo .Ltail` and `b.hi .Loop_outer`.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// First half: words (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21).
	// ror #16/#20/#24/#25 are left-rotations by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: same operations on the rotated word groupings
	// (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes left (< 64)
.Less_than_64:
	// Bias the pointers so that one negative index in x2 counts up to zero:
	// in/key-stream are read at [base + x2] before the increment, out is
	// written at [x0 + x2] after it (hence the extra -1 on x0).
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the key-stream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Wipe the key-stream copy from the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

//-----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Public entry point and dispatcher.
//   len == 0          -> return immediately (.Labort)
//   len <  192        -> scalar code (.Lshort)
//   user-space builds -> SVE when OPENSSL_armcap_P has ARMV8_SVE, otherwise
//                        the NEON check at .Lcheck_neon
//   __KERNEL__ builds -> always the scalar code (trailing `b .Lshort`)
//
// In:    x0 = out, x1 = in, x2 = len, x3 = key, x4 = counter
// Stack: SVE path only: 16-byte frame (x29/x30) plus a 16-byte writable
//        copy of the counter block.
// NOTE(review): the `cbz x2,1f` after the SVE call presumes that
// ChaCha20_ctr32_sve leaves the number of unprocessed bytes in x2 and
// advances x0/x1 accordingly; that routine is not defined in this file.
//-----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2,.Labort
	cmp	x2,#192
	b.lo	.Lshort
#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// SVE handling will inevitably increment the counter.
	// The Neon/scalar code that follows to process tail data needs to
	// use the new counter; unfortunately the input counter buffer
	// pointed to by ctr is meant to be read-only per the API contract,
	// so we have to copy the buffer to the stack to be writable by SVE.
	ldp	x5,x6,[x4]
	stp	x5,x6,[sp]
	mov	x4,sp
	bl	ChaCha20_ctr32_sve
	cbz	x2,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//-----------------------------------------------------------------------
// ChaCha20_neon
//
// Processes 320 bytes per .Loop_outer_neon iteration: four blocks in NEON
// lanes (v16-v31, one state word per vector, one block per lane) interleaved
// instruction-by-instruction with one block in general registers.  Inputs of
// 512 bytes or more are redirected to .L512_or_more_neon.
//
// In:    x0 = out, x1 = in, x2 = len, x3 = key, x4 = counter
// Stack: 96-byte frame (x29/x30, x19-x28) plus 64 bytes that first hold the
//        saved d8/d9 and later the key-stream block for a partial tail; the
//        tail copy is zeroed before returning.
// Uses:  v0-v3 = input block (sigma, key, key, counter), v8 = lane counter
//        offsets (.Lone, advanced by 5 per iteration), v9 = .Lrot24 shuffle.
// The symbol is exported only in __KERNEL__ builds; .LChaCha20_neon is the
// entry used by ChaCha20_ctr32_dflt after it has already signed the link
// register.
//-----------------------------------------------------------------------
#ifdef __KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	stp	d8,d9,[sp]		// meet ABI requirements
	ld1	{v8.4s,v9.4s},[x5]	// v8 = .Lone, v9 = .Lrot24
#ifdef __AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer_neon:
	// Broadcast each state word across the four lanes of its own vector
	// while unpacking the scalar block.
	dup	v16.4s,v0.s[0]		// unpack key block
	mov	w5,w22
	dup	v20.4s,v0.s[1]
	lsr	x6,x22,#32
	dup	v24.4s,v0.s[2]
	mov	w7,w23
	dup	v28.4s,v0.s[3]
	lsr	x8,x23,#32
	dup	v17.4s,v1.s[0]
	mov	w9,w24
	dup	v21.4s,v1.s[1]
	lsr	x10,x24,#32
	dup	v25.4s,v1.s[2]
	mov	w11,w25
	dup	v29.4s,v1.s[3]
	lsr	x12,x25,#32
	dup	v19.4s,v3.s[0]
	mov	w13,w26
	dup	v23.4s,v3.s[1]
	lsr	x14,x26,#32
	dup	v27.4s,v3.s[2]
	mov	w15,w27
	dup	v31.4s,v3.s[3]
	lsr	x16,x27,#32
	add	v19.4s,v19.4s,v8.4s	// per-lane counter offsets
	mov	w17,w28
	dup	v18.4s,v2.s[0]
	lsr	x19,x28,#32
	dup	v22.4s,v2.s[1]
	mov	w20,w30
	dup	v26.4s,v2.s[2]
	lsr	x21,x30,#32
	dup	v30.4s,v2.s[3]

	mov	x4,#10			// 10 iterations of .Loop_neon
	// Flags set here are consumed by `b.lo .Ltail_neon` and
	// `b.hi .Loop_outer_neon`; nothing in between writes them.
	subs	x2,x2,#320
.Loop_neon:
	// NEON rotations: rev32 on .8h = rotate by 16, ushr #20 + sli #12 =
	// left-rotate by 12, tbl with v9 stands in for the scalar ror #24,
	// ushr #25 + sli #7 = left-rotate by 7.
	sub	x4,x4,#1
	add	v16.4s,v16.4s,v17.4s
	add	w5,w5,w9
	add	v20.4s,v20.4s,v21.4s
	add	w6,w6,w10
	add	v24.4s,v24.4s,v25.4s
	add	w7,w7,w11
	add	v28.4s,v28.4s,v29.4s
	add	w8,w8,w12
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w5
	eor	v23.16b,v23.16b,v20.16b
	eor	w19,w19,w6
	eor	v27.16b,v27.16b,v24.16b
	eor	w20,w20,w7
	eor	v31.16b,v31.16b,v28.16b
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w17
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w19
	add	v26.4s,v26.4s,v27.4s
	add	w15,w15,w20
	add	v30.4s,v30.4s,v31.4s
	add	w16,w16,w21
	eor	v4.16b,v17.16b,v18.16b
	eor	w9,w9,w13
	eor	v5.16b,v21.16b,v22.16b
	eor	w10,w10,w14
	eor	v6.16b,v25.16b,v26.16b
	eor	w11,w11,w15
	eor	v7.16b,v29.16b,v30.16b
	eor	w12,w12,w16
	ushr	v17.4s,v4.4s,#20
	ror	w9,w9,#20
	ushr	v21.4s,v5.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v6.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v7.4s,#20
	ror	w12,w12,#20
	sli	v17.4s,v4.4s,#12
	add	w5,w5,w9
	sli	v21.4s,v5.4s,#12
	add	w6,w6,w10
	sli	v25.4s,v6.4s,#12
	add	w7,w7,w11
	sli	v29.4s,v7.4s,#12
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	add	v24.4s,v24.4s,v25.4s
	eor	w20,w20,w7
	add	v28.4s,v28.4s,v29.4s
	eor	w21,w21,w8
	eor	v4.16b,v19.16b,v16.16b
	ror	w17,w17,#24
	eor	v5.16b,v23.16b,v20.16b
	ror	w19,w19,#24
	eor	v6.16b,v27.16b,v24.16b
	ror	w20,w20,#24
	eor	v7.16b,v31.16b,v28.16b
	ror	w21,w21,#24
	tbl	v19.16b,{v4.16b},v9.16b
	add	w13,w13,w17
	tbl	v23.16b,{v5.16b},v9.16b
	add	w14,w14,w19
	tbl	v27.16b,{v6.16b},v9.16b
	add	w15,w15,w20
	tbl	v31.16b,{v7.16b},v9.16b
	add	w16,w16,w21
	add	v18.4s,v18.4s,v19.4s
	eor	w9,w9,w13
	add	v22.4s,v22.4s,v23.4s
	eor	w10,w10,w14
	add	v26.4s,v26.4s,v27.4s
	eor	w11,w11,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w12,w12,w16
	eor	v4.16b,v17.16b,v18.16b
	ror	w9,w9,#25
	eor	v5.16b,v21.16b,v22.16b
	ror	w10,w10,#25
	eor	v6.16b,v25.16b,v26.16b
	ror	w11,w11,#25
	eor	v7.16b,v29.16b,v30.16b
	ror	w12,w12,#25
	ushr	v17.4s,v4.4s,#25
	ushr	v21.4s,v5.4s,#25
	ushr	v25.4s,v6.4s,#25
	ushr	v29.4s,v7.4s,#25
	sli	v17.4s,v4.4s,#7
	sli	v21.4s,v5.4s,#7
	sli	v25.4s,v6.4s,#7
	sli	v29.4s,v7.4s,#7
	// Second half of the iteration: rotated word groupings.
	add	v16.4s,v16.4s,v21.4s
	add	w5,w5,w10
	add	v20.4s,v20.4s,v25.4s
	add	w6,w6,w11
	add	v24.4s,v24.4s,v29.4s
	add	w7,w7,w12
	add	v28.4s,v28.4s,v17.4s
	add	w8,w8,w9
	eor	v31.16b,v31.16b,v16.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v20.16b
	eor	w17,w17,w6
	eor	v23.16b,v23.16b,v24.16b
	eor	w19,w19,w7
	eor	v27.16b,v27.16b,v28.16b
	eor	w20,w20,w8
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	add	v26.4s,v26.4s,v31.4s
	add	w15,w15,w21
	add	v30.4s,v30.4s,v19.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v23.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v27.4s
	add	w14,w14,w20
	eor	v4.16b,v21.16b,v26.16b
	eor	w10,w10,w15
	eor	v5.16b,v25.16b,v30.16b
	eor	w11,w11,w16
	eor	v6.16b,v29.16b,v18.16b
	eor	w12,w12,w13
	eor	v7.16b,v17.16b,v22.16b
	eor	w9,w9,w14
	ushr	v21.4s,v4.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v5.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v6.4s,#20
	ror	w12,w12,#20
	ushr	v17.4s,v7.4s,#20
	ror	w9,w9,#20
	sli	v21.4s,v4.4s,#12
	add	w5,w5,w10
	sli	v25.4s,v5.4s,#12
	add	w6,w6,w11
	sli	v29.4s,v6.4s,#12
	add	w7,w7,w12
	sli	v17.4s,v7.4s,#12
	add	w8,w8,w9
	add	v16.4s,v16.4s,v21.4s
	eor	w21,w21,w5
	add	v20.4s,v20.4s,v25.4s
	eor	w17,w17,w6
	add	v24.4s,v24.4s,v29.4s
	eor	w19,w19,w7
	add	v28.4s,v28.4s,v17.4s
	eor	w20,w20,w8
	eor	v4.16b,v31.16b,v16.16b
	ror	w21,w21,#24
	eor	v5.16b,v19.16b,v20.16b
	ror	w17,w17,#24
	eor	v6.16b,v23.16b,v24.16b
	ror	w19,w19,#24
	eor	v7.16b,v27.16b,v28.16b
	ror	w20,w20,#24
	tbl	v31.16b,{v4.16b},v9.16b
	add	w15,w15,w21
	tbl	v19.16b,{v5.16b},v9.16b
	add	w16,w16,w17
	tbl	v23.16b,{v6.16b},v9.16b
	add	w13,w13,w19
	tbl	v27.16b,{v7.16b},v9.16b
	add	w14,w14,w20
	add	v26.4s,v26.4s,v31.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v19.4s
	eor	w11,w11,w16
	add	v18.4s,v18.4s,v23.4s
	eor	w12,w12,w13
	add	v22.4s,v22.4s,v27.4s
	eor	w9,w9,w14
	eor	v4.16b,v21.16b,v26.16b
	ror	w10,w10,#25
	eor	v5.16b,v25.16b,v30.16b
	ror	w11,w11,#25
	eor	v6.16b,v29.16b,v18.16b
	ror	w12,w12,#25
	eor	v7.16b,v17.16b,v22.16b
	ror	w9,w9,#25
	ushr	v21.4s,v4.4s,#25
	ushr	v25.4s,v5.4s,#25
	ushr	v29.4s,v6.4s,#25
	ushr	v17.4s,v7.4s,#25
	sli	v21.4s,v4.4s,#7
	sli	v25.4s,v5.4s,#7
	sli	v29.4s,v6.4s,#7
	sli	v17.4s,v7.4s,#7
	cbnz	x4,.Loop_neon

	add	v19.4s,v19.4s,v8.4s	// lane counter offsets again (input-block add)

	// Transpose from word-per-vector to block-per-vector layout:
	// block k ends up in v(16+4k)..v(19+4k).
	zip1	v4.4s,v16.4s,v20.4s	// transpose data
	zip1	v5.4s,v24.4s,v28.4s
	zip2	v6.4s,v16.4s,v20.4s
	zip2	v7.4s,v24.4s,v28.4s
	zip1	v16.2d,v4.2d,v5.2d
	zip2	v20.2d,v4.2d,v5.2d
	zip1	v24.2d,v6.2d,v7.2d
	zip2	v28.2d,v6.2d,v7.2d

	zip1	v4.4s,v17.4s,v21.4s
	zip1	v5.4s,v25.4s,v29.4s
	zip2	v6.4s,v17.4s,v21.4s
	zip2	v7.4s,v25.4s,v29.4s
	zip1	v17.2d,v4.2d,v5.2d
	zip2	v21.2d,v4.2d,v5.2d
	zip1	v25.2d,v6.2d,v7.2d
	zip2	v29.2d,v6.2d,v7.2d

	zip1	v4.4s,v18.4s,v22.4s
	add	w5,w5,w22		// accumulate key block
	zip1	v5.4s,v26.4s,v30.4s
	add	x6,x6,x22,lsr#32
	zip2	v6.4s,v18.4s,v22.4s
	add	w7,w7,w23
	zip2	v7.4s,v26.4s,v30.4s
	add	x8,x8,x23,lsr#32
	zip1	v18.2d,v4.2d,v5.2d
	add	w9,w9,w24
	zip2	v22.2d,v4.2d,v5.2d
	add	x10,x10,x24,lsr#32
	zip1	v26.2d,v6.2d,v7.2d
	add	w11,w11,w25
	zip2	v30.2d,v6.2d,v7.2d
	add	x12,x12,x25,lsr#32

	zip1	v4.4s,v19.4s,v23.4s
	add	w13,w13,w26
	zip1	v5.4s,v27.4s,v31.4s
	add	x14,x14,x26,lsr#32
	zip2	v6.4s,v19.4s,v23.4s
	add	w15,w15,w27
	zip2	v7.4s,v27.4s,v31.4s
	add	x16,x16,x27,lsr#32
	zip1	v19.2d,v4.2d,v5.2d
	add	w17,w17,w28
	zip2	v23.2d,v4.2d,v5.2d
	add	x19,x19,x28,lsr#32
	zip1	v27.2d,v6.2d,v7.2d
	add	w20,w20,w30
	zip2	v31.2d,v6.2d,v7.2d
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail_neon		// fewer than 320 bytes were left

	// Full 320-byte chunk: scalar block first, then the four NEON blocks.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	v17.4s,v17.4s,v1.4s
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	v18.4s,v18.4s,v2.4s
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	x5,x5,x6
	add	v20.4s,v20.4s,v0.4s
	eor	x7,x7,x8
	add	v21.4s,v21.4s,v1.4s
	eor	x9,x9,x10
	add	v22.4s,v22.4s,v2.4s
	eor	x11,x11,x12
	add	v23.4s,v23.4s,v3.4s
	eor	x13,x13,x14
	eor	v16.16b,v16.16b,v4.16b
	movi	v4.4s,#5
	eor	x15,x15,x16
	eor	v17.16b,v17.16b,v5.16b
	eor	x17,x17,x19
	eor	v18.16b,v18.16b,v6.16b
	eor	x20,x20,x21
	eor	v19.16b,v19.16b,v7.16b
	add	v8.4s,v8.4s,v4.4s	// += 5
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#5		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	add	v24.4s,v24.4s,v0.4s
	add	v25.4s,v25.4s,v1.4s
	add	v26.4s,v26.4s,v2.4s
	add	v27.4s,v27.4s,v3.4s
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64

	eor	v20.16b,v20.16b,v4.16b
	eor	v21.16b,v21.16b,v5.16b
	eor	v22.16b,v22.16b,v6.16b
	eor	v23.16b,v23.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v1.4s
	add	v30.4s,v30.4s,v2.4s
	add	v31.4s,v31.4s,v3.4s
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	b.hi	.Loop_outer_neon	// more input remains

	ldp	d8,d9,[sp]		// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail_neon:
	add	x2,x2,#320		// undo the subs: x2 = bytes left (< 320)
	// d8/d9 are restored now because the scratch slot they were saved in
	// is reused below for the partial-block key stream.
	ldp	d8,d9,[sp]		// meet ABI requirements
	cmp	x2,#64
	b.lo	.Less_than_64		// scalar block alone covers the tail

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	// Flags from `cmp x2,#64` above are still live: none of the stp /
	// vector add / address add instructions below writes them.
	stp	x5,x7,[x0,#0]		// store output
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	stp	x9,x11,[x0,#16]
	add	v17.4s,v17.4s,v1.4s
	stp	x13,x15,[x0,#32]
	add	v18.4s,v18.4s,v2.4s
	stp	x17,x20,[x0,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x0,x0,#64
	b.eq	.Ldone_neon		// exactly 64 bytes
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v16.16b,v16.16b,v4.16b
	eor	v17.16b,v17.16b,v5.16b
	eor	v18.16b,v18.16b,v6.16b
	eor	v19.16b,v19.16b,v7.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	b.eq	.Ldone_neon

	// From here on v16-v19 always hold the next unused key-stream block,
	// which is what .Last_neon spills to the stack.
	add	v16.4s,v20.4s,v0.4s
	add	v17.4s,v21.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v22.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v23.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v20.16b,v16.16b,v4.16b
	eor	v21.16b,v17.16b,v5.16b
	eor	v22.16b,v18.16b,v6.16b
	eor	v23.16b,v19.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v24.4s,v0.4s
	add	v17.4s,v25.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v26.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v27.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v24.16b,v16.16b,v4.16b
	eor	v25.16b,v17.16b,v5.16b
	eor	v26.16b,v18.16b,v6.16b
	eor	v27.16b,v19.16b,v7.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v28.4s,v0.4s
	add	v17.4s,v29.4s,v1.4s
	add	v18.4s,v30.4s,v2.4s
	add	v19.4s,v31.4s,v3.4s
	sub	x2,x2,#64

.Last_neon:
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]

	// Same negative-index byte loop as .Less_than_64.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Wipe the key-stream copy from the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon

//-----------------------------------------------------------------------
// ChaCha20_512_neon
//
// Code for inputs of 512 bytes or more; ChaCha20_neon branches to
// .L512_or_more_neon after building the same 96-byte frame.
// NOTE(review): only the head of this routine is reproduced here, verbatim;
// its body continues on the lines that follow this block.
//-----------------------------------------------------------------------
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	v7.16b,v7.16b,v7.16b
	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	ld1	{v7.s}[0],[x5]
	add	x3,x5,#16		// .Lrot24
#ifdef __AARCH64EB__
	rev64 
v0.4s,v0.4s955ror x24,x24,#32956ror x25,x25,#32957ror x26,x26,#32958ror x27,x27,#32959ror x28,x28,#32960ror x30,x30,#32961#endif962add v3.4s,v3.4s,v7.4s // += 1963stp q0,q1,[sp,#0] // off-load key block, invariant part964add v3.4s,v3.4s,v7.4s // not typo965str q2,[sp,#32]966add v4.4s,v3.4s,v7.4s967add v5.4s,v4.4s,v7.4s968add v6.4s,v5.4s,v7.4s969shl v7.4s,v7.4s,#2 // 1 -> 4970971stp d8,d9,[sp,#128+0] // meet ABI requirements972stp d10,d11,[sp,#128+16]973stp d12,d13,[sp,#128+32]974stp d14,d15,[sp,#128+48]975976sub x2,x2,#512 // not typo977978.Loop_outer_512_neon:979mov v8.16b,v0.16b980mov v12.16b,v0.16b981mov v16.16b,v0.16b982mov v20.16b,v0.16b983mov v24.16b,v0.16b984mov v28.16b,v0.16b985mov v9.16b,v1.16b986mov w5,w22 // unpack key block987mov v13.16b,v1.16b988lsr x6,x22,#32989mov v17.16b,v1.16b990mov w7,w23991mov v21.16b,v1.16b992lsr x8,x23,#32993mov v25.16b,v1.16b994mov w9,w24995mov v29.16b,v1.16b996lsr x10,x24,#32997mov v11.16b,v3.16b998mov w11,w25999mov v15.16b,v4.16b1000lsr x12,x25,#321001mov v19.16b,v5.16b1002mov w13,w261003mov v23.16b,v6.16b1004lsr x14,x26,#321005mov v10.16b,v2.16b1006mov w15,w271007mov v14.16b,v2.16b1008lsr x16,x27,#321009add v27.4s,v11.4s,v7.4s // +41010mov w17,w281011add v31.4s,v15.4s,v7.4s // +41012lsr x19,x28,#321013mov v18.16b,v2.16b1014mov w20,w301015mov v22.16b,v2.16b1016lsr x21,x30,#321017mov v26.16b,v2.16b1018stp q3,q4,[sp,#48] // off-load key block, variable part1019mov v30.16b,v2.16b1020stp q5,q6,[sp,#80]10211022mov x4,#51023ld1 {v6.4s},[x3]1024subs x2,x2,#5121025.Loop_upper_neon:1026sub x4,x4,#11027add v8.4s,v8.4s,v9.4s1028add w5,w5,w91029add v12.4s,v12.4s,v13.4s1030add w6,w6,w101031add v16.4s,v16.4s,v17.4s1032add w7,w7,w111033add v20.4s,v20.4s,v21.4s1034add w8,w8,w121035add v24.4s,v24.4s,v25.4s1036eor w17,w17,w51037add v28.4s,v28.4s,v29.4s1038eor w19,w19,w61039eor v11.16b,v11.16b,v8.16b1040eor w20,w20,w71041eor v15.16b,v15.16b,v12.16b1042eor w21,w21,w81043eor v19.16b,v19.16b,v16.16b1044ror w17,w17,#161045eor 
v23.16b,v23.16b,v20.16b1046ror w19,w19,#161047eor v27.16b,v27.16b,v24.16b1048ror w20,w20,#161049eor v31.16b,v31.16b,v28.16b1050ror w21,w21,#161051rev32 v11.8h,v11.8h1052add w13,w13,w171053rev32 v15.8h,v15.8h1054add w14,w14,w191055rev32 v19.8h,v19.8h1056add w15,w15,w201057rev32 v23.8h,v23.8h1058add w16,w16,w211059rev32 v27.8h,v27.8h1060eor w9,w9,w131061rev32 v31.8h,v31.8h1062eor w10,w10,w141063add v10.4s,v10.4s,v11.4s1064eor w11,w11,w151065add v14.4s,v14.4s,v15.4s1066eor w12,w12,w161067add v18.4s,v18.4s,v19.4s1068ror w9,w9,#201069add v22.4s,v22.4s,v23.4s1070ror w10,w10,#201071add v26.4s,v26.4s,v27.4s1072ror w11,w11,#201073add v30.4s,v30.4s,v31.4s1074ror w12,w12,#201075eor v0.16b,v9.16b,v10.16b1076add w5,w5,w91077eor v1.16b,v13.16b,v14.16b1078add w6,w6,w101079eor v2.16b,v17.16b,v18.16b1080add w7,w7,w111081eor v3.16b,v21.16b,v22.16b1082add w8,w8,w121083eor v4.16b,v25.16b,v26.16b1084eor w17,w17,w51085eor v5.16b,v29.16b,v30.16b1086eor w19,w19,w61087ushr v9.4s,v0.4s,#201088eor w20,w20,w71089ushr v13.4s,v1.4s,#201090eor w21,w21,w81091ushr v17.4s,v2.4s,#201092ror w17,w17,#241093ushr v21.4s,v3.4s,#201094ror w19,w19,#241095ushr v25.4s,v4.4s,#201096ror w20,w20,#241097ushr v29.4s,v5.4s,#201098ror w21,w21,#241099sli v9.4s,v0.4s,#121100add w13,w13,w171101sli v13.4s,v1.4s,#121102add w14,w14,w191103sli v17.4s,v2.4s,#121104add w15,w15,w201105sli v21.4s,v3.4s,#121106add w16,w16,w211107sli v25.4s,v4.4s,#121108eor w9,w9,w131109sli v29.4s,v5.4s,#121110eor w10,w10,w141111add v8.4s,v8.4s,v9.4s1112eor w11,w11,w151113add v12.4s,v12.4s,v13.4s1114eor w12,w12,w161115add v16.4s,v16.4s,v17.4s1116ror w9,w9,#251117add v20.4s,v20.4s,v21.4s1118ror w10,w10,#251119add v24.4s,v24.4s,v25.4s1120ror w11,w11,#251121add v28.4s,v28.4s,v29.4s1122ror w12,w12,#251123eor v11.16b,v11.16b,v8.16b1124add w5,w5,w101125eor v15.16b,v15.16b,v12.16b1126add w6,w6,w111127eor v19.16b,v19.16b,v16.16b1128add w7,w7,w121129eor v23.16b,v23.16b,v20.16b1130add w8,w8,w91131eor v27.16b,v27.16b,v24.16b1132eor w21,w21,w51133eor 
v31.16b,v31.16b,v28.16b1134eor w17,w17,w61135tbl v11.16b,{v11.16b},v6.16b1136eor w19,w19,w71137tbl v15.16b,{v15.16b},v6.16b1138eor w20,w20,w81139tbl v19.16b,{v19.16b},v6.16b1140ror w21,w21,#161141tbl v23.16b,{v23.16b},v6.16b1142ror w17,w17,#161143tbl v27.16b,{v27.16b},v6.16b1144ror w19,w19,#161145tbl v31.16b,{v31.16b},v6.16b1146ror w20,w20,#161147add v10.4s,v10.4s,v11.4s1148add w15,w15,w211149add v14.4s,v14.4s,v15.4s1150add w16,w16,w171151add v18.4s,v18.4s,v19.4s1152add w13,w13,w191153add v22.4s,v22.4s,v23.4s1154add w14,w14,w201155add v26.4s,v26.4s,v27.4s1156eor w10,w10,w151157add v30.4s,v30.4s,v31.4s1158eor w11,w11,w161159eor v0.16b,v9.16b,v10.16b1160eor w12,w12,w131161eor v1.16b,v13.16b,v14.16b1162eor w9,w9,w141163eor v2.16b,v17.16b,v18.16b1164ror w10,w10,#201165eor v3.16b,v21.16b,v22.16b1166ror w11,w11,#201167eor v4.16b,v25.16b,v26.16b1168ror w12,w12,#201169eor v5.16b,v29.16b,v30.16b1170ror w9,w9,#201171ushr v9.4s,v0.4s,#251172add w5,w5,w101173ushr v13.4s,v1.4s,#251174add w6,w6,w111175ushr v17.4s,v2.4s,#251176add w7,w7,w121177ushr v21.4s,v3.4s,#251178add w8,w8,w91179ushr v25.4s,v4.4s,#251180eor w21,w21,w51181ushr v29.4s,v5.4s,#251182eor w17,w17,w61183sli v9.4s,v0.4s,#71184eor w19,w19,w71185sli v13.4s,v1.4s,#71186eor w20,w20,w81187sli v17.4s,v2.4s,#71188ror w21,w21,#241189sli v21.4s,v3.4s,#71190ror w17,w17,#241191sli v25.4s,v4.4s,#71192ror w19,w19,#241193sli v29.4s,v5.4s,#71194ror w20,w20,#241195ext v10.16b,v10.16b,v10.16b,#81196add w15,w15,w211197ext v14.16b,v14.16b,v14.16b,#81198add w16,w16,w171199ext v18.16b,v18.16b,v18.16b,#81200add w13,w13,w191201ext v22.16b,v22.16b,v22.16b,#81202add w14,w14,w201203ext v26.16b,v26.16b,v26.16b,#81204eor w10,w10,w151205ext v30.16b,v30.16b,v30.16b,#81206eor w11,w11,w161207ext v11.16b,v11.16b,v11.16b,#121208eor w12,w12,w131209ext v15.16b,v15.16b,v15.16b,#121210eor w9,w9,w141211ext v19.16b,v19.16b,v19.16b,#121212ror w10,w10,#251213ext v23.16b,v23.16b,v23.16b,#121214ror w11,w11,#251215ext v27.16b,v27.16b,v27.16b,#121216ror 
w12,w12,#251217ext v31.16b,v31.16b,v31.16b,#121218ror w9,w9,#251219ext v9.16b,v9.16b,v9.16b,#41220ext v13.16b,v13.16b,v13.16b,#41221ext v17.16b,v17.16b,v17.16b,#41222ext v21.16b,v21.16b,v21.16b,#41223ext v25.16b,v25.16b,v25.16b,#41224ext v29.16b,v29.16b,v29.16b,#41225add v8.4s,v8.4s,v9.4s1226add w5,w5,w91227add v12.4s,v12.4s,v13.4s1228add w6,w6,w101229add v16.4s,v16.4s,v17.4s1230add w7,w7,w111231add v20.4s,v20.4s,v21.4s1232add w8,w8,w121233add v24.4s,v24.4s,v25.4s1234eor w17,w17,w51235add v28.4s,v28.4s,v29.4s1236eor w19,w19,w61237eor v11.16b,v11.16b,v8.16b1238eor w20,w20,w71239eor v15.16b,v15.16b,v12.16b1240eor w21,w21,w81241eor v19.16b,v19.16b,v16.16b1242ror w17,w17,#161243eor v23.16b,v23.16b,v20.16b1244ror w19,w19,#161245eor v27.16b,v27.16b,v24.16b1246ror w20,w20,#161247eor v31.16b,v31.16b,v28.16b1248ror w21,w21,#161249rev32 v11.8h,v11.8h1250add w13,w13,w171251rev32 v15.8h,v15.8h1252add w14,w14,w191253rev32 v19.8h,v19.8h1254add w15,w15,w201255rev32 v23.8h,v23.8h1256add w16,w16,w211257rev32 v27.8h,v27.8h1258eor w9,w9,w131259rev32 v31.8h,v31.8h1260eor w10,w10,w141261add v10.4s,v10.4s,v11.4s1262eor w11,w11,w151263add v14.4s,v14.4s,v15.4s1264eor w12,w12,w161265add v18.4s,v18.4s,v19.4s1266ror w9,w9,#201267add v22.4s,v22.4s,v23.4s1268ror w10,w10,#201269add v26.4s,v26.4s,v27.4s1270ror w11,w11,#201271add v30.4s,v30.4s,v31.4s1272ror w12,w12,#201273eor v0.16b,v9.16b,v10.16b1274add w5,w5,w91275eor v1.16b,v13.16b,v14.16b1276add w6,w6,w101277eor v2.16b,v17.16b,v18.16b1278add w7,w7,w111279eor v3.16b,v21.16b,v22.16b1280add w8,w8,w121281eor v4.16b,v25.16b,v26.16b1282eor w17,w17,w51283eor v5.16b,v29.16b,v30.16b1284eor w19,w19,w61285ushr v9.4s,v0.4s,#201286eor w20,w20,w71287ushr v13.4s,v1.4s,#201288eor w21,w21,w81289ushr v17.4s,v2.4s,#201290ror w17,w17,#241291ushr v21.4s,v3.4s,#201292ror w19,w19,#241293ushr v25.4s,v4.4s,#201294ror w20,w20,#241295ushr v29.4s,v5.4s,#201296ror w21,w21,#241297sli v9.4s,v0.4s,#121298add w13,w13,w171299sli v13.4s,v1.4s,#121300add w14,w14,w191301sli 
v17.4s,v2.4s,#121302add w15,w15,w201303sli v21.4s,v3.4s,#121304add w16,w16,w211305sli v25.4s,v4.4s,#121306eor w9,w9,w131307sli v29.4s,v5.4s,#121308eor w10,w10,w141309add v8.4s,v8.4s,v9.4s1310eor w11,w11,w151311add v12.4s,v12.4s,v13.4s1312eor w12,w12,w161313add v16.4s,v16.4s,v17.4s1314ror w9,w9,#251315add v20.4s,v20.4s,v21.4s1316ror w10,w10,#251317add v24.4s,v24.4s,v25.4s1318ror w11,w11,#251319add v28.4s,v28.4s,v29.4s1320ror w12,w12,#251321eor v11.16b,v11.16b,v8.16b1322add w5,w5,w101323eor v15.16b,v15.16b,v12.16b1324add w6,w6,w111325eor v19.16b,v19.16b,v16.16b1326add w7,w7,w121327eor v23.16b,v23.16b,v20.16b1328add w8,w8,w91329eor v27.16b,v27.16b,v24.16b1330eor w21,w21,w51331eor v31.16b,v31.16b,v28.16b1332eor w17,w17,w61333tbl v11.16b,{v11.16b},v6.16b1334eor w19,w19,w71335tbl v15.16b,{v15.16b},v6.16b1336eor w20,w20,w81337tbl v19.16b,{v19.16b},v6.16b1338ror w21,w21,#161339tbl v23.16b,{v23.16b},v6.16b1340ror w17,w17,#161341tbl v27.16b,{v27.16b},v6.16b1342ror w19,w19,#161343tbl v31.16b,{v31.16b},v6.16b1344ror w20,w20,#161345add v10.4s,v10.4s,v11.4s1346add w15,w15,w211347add v14.4s,v14.4s,v15.4s1348add w16,w16,w171349add v18.4s,v18.4s,v19.4s1350add w13,w13,w191351add v22.4s,v22.4s,v23.4s1352add w14,w14,w201353add v26.4s,v26.4s,v27.4s1354eor w10,w10,w151355add v30.4s,v30.4s,v31.4s1356eor w11,w11,w161357eor v0.16b,v9.16b,v10.16b1358eor w12,w12,w131359eor v1.16b,v13.16b,v14.16b1360eor w9,w9,w141361eor v2.16b,v17.16b,v18.16b1362ror w10,w10,#201363eor v3.16b,v21.16b,v22.16b1364ror w11,w11,#201365eor v4.16b,v25.16b,v26.16b1366ror w12,w12,#201367eor v5.16b,v29.16b,v30.16b1368ror w9,w9,#201369ushr v9.4s,v0.4s,#251370add w5,w5,w101371ushr v13.4s,v1.4s,#251372add w6,w6,w111373ushr v17.4s,v2.4s,#251374add w7,w7,w121375ushr v21.4s,v3.4s,#251376add w8,w8,w91377ushr v25.4s,v4.4s,#251378eor w21,w21,w51379ushr v29.4s,v5.4s,#251380eor w17,w17,w61381sli v9.4s,v0.4s,#71382eor w19,w19,w71383sli v13.4s,v1.4s,#71384eor w20,w20,w81385sli v17.4s,v2.4s,#71386ror w21,w21,#241387sli 
v21.4s,v3.4s,#71388ror w17,w17,#241389sli v25.4s,v4.4s,#71390ror w19,w19,#241391sli v29.4s,v5.4s,#71392ror w20,w20,#241393ext v10.16b,v10.16b,v10.16b,#81394add w15,w15,w211395ext v14.16b,v14.16b,v14.16b,#81396add w16,w16,w171397ext v18.16b,v18.16b,v18.16b,#81398add w13,w13,w191399ext v22.16b,v22.16b,v22.16b,#81400add w14,w14,w201401ext v26.16b,v26.16b,v26.16b,#81402eor w10,w10,w151403ext v30.16b,v30.16b,v30.16b,#81404eor w11,w11,w161405ext v11.16b,v11.16b,v11.16b,#41406eor w12,w12,w131407ext v15.16b,v15.16b,v15.16b,#41408eor w9,w9,w141409ext v19.16b,v19.16b,v19.16b,#41410ror w10,w10,#251411ext v23.16b,v23.16b,v23.16b,#41412ror w11,w11,#251413ext v27.16b,v27.16b,v27.16b,#41414ror w12,w12,#251415ext v31.16b,v31.16b,v31.16b,#41416ror w9,w9,#251417ext v9.16b,v9.16b,v9.16b,#121418ext v13.16b,v13.16b,v13.16b,#121419ext v17.16b,v17.16b,v17.16b,#121420ext v21.16b,v21.16b,v21.16b,#121421ext v25.16b,v25.16b,v25.16b,#121422ext v29.16b,v29.16b,v29.16b,#121423cbnz x4,.Loop_upper_neon14241425add w5,w5,w22 // accumulate key block1426add x6,x6,x22,lsr#321427add w7,w7,w231428add x8,x8,x23,lsr#321429add w9,w9,w241430add x10,x10,x24,lsr#321431add w11,w11,w251432add x12,x12,x25,lsr#321433add w13,w13,w261434add x14,x14,x26,lsr#321435add w15,w15,w271436add x16,x16,x27,lsr#321437add w17,w17,w281438add x19,x19,x28,lsr#321439add w20,w20,w301440add x21,x21,x30,lsr#3214411442add x5,x5,x6,lsl#32 // pack1443add x7,x7,x8,lsl#321444ldp x6,x8,[x1,#0] // load input1445add x9,x9,x10,lsl#321446add x11,x11,x12,lsl#321447ldp x10,x12,[x1,#16]1448add x13,x13,x14,lsl#321449add x15,x15,x16,lsl#321450ldp x14,x16,[x1,#32]1451add x17,x17,x19,lsl#321452add x20,x20,x21,lsl#321453ldp x19,x21,[x1,#48]1454add x1,x1,#641455#ifdef __AARCH64EB__1456rev x5,x51457rev x7,x71458rev x9,x91459rev x11,x111460rev x13,x131461rev x15,x151462rev x17,x171463rev x20,x201464#endif1465eor x5,x5,x61466eor x7,x7,x81467eor x9,x9,x101468eor x11,x11,x121469eor x13,x13,x141470eor x15,x15,x161471eor x17,x17,x191472eor 
x20,x20,x2114731474stp x5,x7,[x0,#0] // store output1475add x28,x28,#1 // increment counter1476mov w5,w22 // unpack key block1477lsr x6,x22,#321478stp x9,x11,[x0,#16]1479mov w7,w231480lsr x8,x23,#321481stp x13,x15,[x0,#32]1482mov w9,w241483lsr x10,x24,#321484stp x17,x20,[x0,#48]1485add x0,x0,#641486mov w11,w251487lsr x12,x25,#321488mov w13,w261489lsr x14,x26,#321490mov w15,w271491lsr x16,x27,#321492mov w17,w281493lsr x19,x28,#321494mov w20,w301495lsr x21,x30,#3214961497mov x4,#51498.Loop_lower_neon:1499sub x4,x4,#11500add v8.4s,v8.4s,v9.4s1501add w5,w5,w91502add v12.4s,v12.4s,v13.4s1503add w6,w6,w101504add v16.4s,v16.4s,v17.4s1505add w7,w7,w111506add v20.4s,v20.4s,v21.4s1507add w8,w8,w121508add v24.4s,v24.4s,v25.4s1509eor w17,w17,w51510add v28.4s,v28.4s,v29.4s1511eor w19,w19,w61512eor v11.16b,v11.16b,v8.16b1513eor w20,w20,w71514eor v15.16b,v15.16b,v12.16b1515eor w21,w21,w81516eor v19.16b,v19.16b,v16.16b1517ror w17,w17,#161518eor v23.16b,v23.16b,v20.16b1519ror w19,w19,#161520eor v27.16b,v27.16b,v24.16b1521ror w20,w20,#161522eor v31.16b,v31.16b,v28.16b1523ror w21,w21,#161524rev32 v11.8h,v11.8h1525add w13,w13,w171526rev32 v15.8h,v15.8h1527add w14,w14,w191528rev32 v19.8h,v19.8h1529add w15,w15,w201530rev32 v23.8h,v23.8h1531add w16,w16,w211532rev32 v27.8h,v27.8h1533eor w9,w9,w131534rev32 v31.8h,v31.8h1535eor w10,w10,w141536add v10.4s,v10.4s,v11.4s1537eor w11,w11,w151538add v14.4s,v14.4s,v15.4s1539eor w12,w12,w161540add v18.4s,v18.4s,v19.4s1541ror w9,w9,#201542add v22.4s,v22.4s,v23.4s1543ror w10,w10,#201544add v26.4s,v26.4s,v27.4s1545ror w11,w11,#201546add v30.4s,v30.4s,v31.4s1547ror w12,w12,#201548eor v0.16b,v9.16b,v10.16b1549add w5,w5,w91550eor v1.16b,v13.16b,v14.16b1551add w6,w6,w101552eor v2.16b,v17.16b,v18.16b1553add w7,w7,w111554eor v3.16b,v21.16b,v22.16b1555add w8,w8,w121556eor v4.16b,v25.16b,v26.16b1557eor w17,w17,w51558eor v5.16b,v29.16b,v30.16b1559eor w19,w19,w61560ushr v9.4s,v0.4s,#201561eor w20,w20,w71562ushr v13.4s,v1.4s,#201563eor w21,w21,w81564ushr 
v17.4s,v2.4s,#201565ror w17,w17,#241566ushr v21.4s,v3.4s,#201567ror w19,w19,#241568ushr v25.4s,v4.4s,#201569ror w20,w20,#241570ushr v29.4s,v5.4s,#201571ror w21,w21,#241572sli v9.4s,v0.4s,#121573add w13,w13,w171574sli v13.4s,v1.4s,#121575add w14,w14,w191576sli v17.4s,v2.4s,#121577add w15,w15,w201578sli v21.4s,v3.4s,#121579add w16,w16,w211580sli v25.4s,v4.4s,#121581eor w9,w9,w131582sli v29.4s,v5.4s,#121583eor w10,w10,w141584add v8.4s,v8.4s,v9.4s1585eor w11,w11,w151586add v12.4s,v12.4s,v13.4s1587eor w12,w12,w161588add v16.4s,v16.4s,v17.4s1589ror w9,w9,#251590add v20.4s,v20.4s,v21.4s1591ror w10,w10,#251592add v24.4s,v24.4s,v25.4s1593ror w11,w11,#251594add v28.4s,v28.4s,v29.4s1595ror w12,w12,#251596eor v11.16b,v11.16b,v8.16b1597add w5,w5,w101598eor v15.16b,v15.16b,v12.16b1599add w6,w6,w111600eor v19.16b,v19.16b,v16.16b1601add w7,w7,w121602eor v23.16b,v23.16b,v20.16b1603add w8,w8,w91604eor v27.16b,v27.16b,v24.16b1605eor w21,w21,w51606eor v31.16b,v31.16b,v28.16b1607eor w17,w17,w61608tbl v11.16b,{v11.16b},v6.16b1609eor w19,w19,w71610tbl v15.16b,{v15.16b},v6.16b1611eor w20,w20,w81612tbl v19.16b,{v19.16b},v6.16b1613ror w21,w21,#161614tbl v23.16b,{v23.16b},v6.16b1615ror w17,w17,#161616tbl v27.16b,{v27.16b},v6.16b1617ror w19,w19,#161618tbl v31.16b,{v31.16b},v6.16b1619ror w20,w20,#161620add v10.4s,v10.4s,v11.4s1621add w15,w15,w211622add v14.4s,v14.4s,v15.4s1623add w16,w16,w171624add v18.4s,v18.4s,v19.4s1625add w13,w13,w191626add v22.4s,v22.4s,v23.4s1627add w14,w14,w201628add v26.4s,v26.4s,v27.4s1629eor w10,w10,w151630add v30.4s,v30.4s,v31.4s1631eor w11,w11,w161632eor v0.16b,v9.16b,v10.16b1633eor w12,w12,w131634eor v1.16b,v13.16b,v14.16b1635eor w9,w9,w141636eor v2.16b,v17.16b,v18.16b1637ror w10,w10,#201638eor v3.16b,v21.16b,v22.16b1639ror w11,w11,#201640eor v4.16b,v25.16b,v26.16b1641ror w12,w12,#201642eor v5.16b,v29.16b,v30.16b1643ror w9,w9,#201644ushr v9.4s,v0.4s,#251645add w5,w5,w101646ushr v13.4s,v1.4s,#251647add w6,w6,w111648ushr v17.4s,v2.4s,#251649add w7,w7,w121650ushr 
v21.4s,v3.4s,#251651add w8,w8,w91652ushr v25.4s,v4.4s,#251653eor w21,w21,w51654ushr v29.4s,v5.4s,#251655eor w17,w17,w61656sli v9.4s,v0.4s,#71657eor w19,w19,w71658sli v13.4s,v1.4s,#71659eor w20,w20,w81660sli v17.4s,v2.4s,#71661ror w21,w21,#241662sli v21.4s,v3.4s,#71663ror w17,w17,#241664sli v25.4s,v4.4s,#71665ror w19,w19,#241666sli v29.4s,v5.4s,#71667ror w20,w20,#241668ext v10.16b,v10.16b,v10.16b,#81669add w15,w15,w211670ext v14.16b,v14.16b,v14.16b,#81671add w16,w16,w171672ext v18.16b,v18.16b,v18.16b,#81673add w13,w13,w191674ext v22.16b,v22.16b,v22.16b,#81675add w14,w14,w201676ext v26.16b,v26.16b,v26.16b,#81677eor w10,w10,w151678ext v30.16b,v30.16b,v30.16b,#81679eor w11,w11,w161680ext v11.16b,v11.16b,v11.16b,#121681eor w12,w12,w131682ext v15.16b,v15.16b,v15.16b,#121683eor w9,w9,w141684ext v19.16b,v19.16b,v19.16b,#121685ror w10,w10,#251686ext v23.16b,v23.16b,v23.16b,#121687ror w11,w11,#251688ext v27.16b,v27.16b,v27.16b,#121689ror w12,w12,#251690ext v31.16b,v31.16b,v31.16b,#121691ror w9,w9,#251692ext v9.16b,v9.16b,v9.16b,#41693ext v13.16b,v13.16b,v13.16b,#41694ext v17.16b,v17.16b,v17.16b,#41695ext v21.16b,v21.16b,v21.16b,#41696ext v25.16b,v25.16b,v25.16b,#41697ext v29.16b,v29.16b,v29.16b,#41698add v8.4s,v8.4s,v9.4s1699add w5,w5,w91700add v12.4s,v12.4s,v13.4s1701add w6,w6,w101702add v16.4s,v16.4s,v17.4s1703add w7,w7,w111704add v20.4s,v20.4s,v21.4s1705add w8,w8,w121706add v24.4s,v24.4s,v25.4s1707eor w17,w17,w51708add v28.4s,v28.4s,v29.4s1709eor w19,w19,w61710eor v11.16b,v11.16b,v8.16b1711eor w20,w20,w71712eor v15.16b,v15.16b,v12.16b1713eor w21,w21,w81714eor v19.16b,v19.16b,v16.16b1715ror w17,w17,#161716eor v23.16b,v23.16b,v20.16b1717ror w19,w19,#161718eor v27.16b,v27.16b,v24.16b1719ror w20,w20,#161720eor v31.16b,v31.16b,v28.16b1721ror w21,w21,#161722rev32 v11.8h,v11.8h1723add w13,w13,w171724rev32 v15.8h,v15.8h1725add w14,w14,w191726rev32 v19.8h,v19.8h1727add w15,w15,w201728rev32 v23.8h,v23.8h1729add w16,w16,w211730rev32 v27.8h,v27.8h1731eor w9,w9,w131732rev32 
v31.8h,v31.8h1733eor w10,w10,w141734add v10.4s,v10.4s,v11.4s1735eor w11,w11,w151736add v14.4s,v14.4s,v15.4s1737eor w12,w12,w161738add v18.4s,v18.4s,v19.4s1739ror w9,w9,#201740add v22.4s,v22.4s,v23.4s1741ror w10,w10,#201742add v26.4s,v26.4s,v27.4s1743ror w11,w11,#201744add v30.4s,v30.4s,v31.4s1745ror w12,w12,#201746eor v0.16b,v9.16b,v10.16b1747add w5,w5,w91748eor v1.16b,v13.16b,v14.16b1749add w6,w6,w101750eor v2.16b,v17.16b,v18.16b1751add w7,w7,w111752eor v3.16b,v21.16b,v22.16b1753add w8,w8,w121754eor v4.16b,v25.16b,v26.16b1755eor w17,w17,w51756eor v5.16b,v29.16b,v30.16b1757eor w19,w19,w61758ushr v9.4s,v0.4s,#201759eor w20,w20,w71760ushr v13.4s,v1.4s,#201761eor w21,w21,w81762ushr v17.4s,v2.4s,#201763ror w17,w17,#241764ushr v21.4s,v3.4s,#201765ror w19,w19,#241766ushr v25.4s,v4.4s,#201767ror w20,w20,#241768ushr v29.4s,v5.4s,#201769ror w21,w21,#241770sli v9.4s,v0.4s,#121771add w13,w13,w171772sli v13.4s,v1.4s,#121773add w14,w14,w191774sli v17.4s,v2.4s,#121775add w15,w15,w201776sli v21.4s,v3.4s,#121777add w16,w16,w211778sli v25.4s,v4.4s,#121779eor w9,w9,w131780sli v29.4s,v5.4s,#121781eor w10,w10,w141782add v8.4s,v8.4s,v9.4s1783eor w11,w11,w151784add v12.4s,v12.4s,v13.4s1785eor w12,w12,w161786add v16.4s,v16.4s,v17.4s1787ror w9,w9,#251788add v20.4s,v20.4s,v21.4s1789ror w10,w10,#251790add v24.4s,v24.4s,v25.4s1791ror w11,w11,#251792add v28.4s,v28.4s,v29.4s1793ror w12,w12,#251794eor v11.16b,v11.16b,v8.16b1795add w5,w5,w101796eor v15.16b,v15.16b,v12.16b1797add w6,w6,w111798eor v19.16b,v19.16b,v16.16b1799add w7,w7,w121800eor v23.16b,v23.16b,v20.16b1801add w8,w8,w91802eor v27.16b,v27.16b,v24.16b1803eor w21,w21,w51804eor v31.16b,v31.16b,v28.16b1805eor w17,w17,w61806tbl v11.16b,{v11.16b},v6.16b1807eor w19,w19,w71808tbl v15.16b,{v15.16b},v6.16b1809eor w20,w20,w81810tbl v19.16b,{v19.16b},v6.16b1811ror w21,w21,#161812tbl v23.16b,{v23.16b},v6.16b1813ror w17,w17,#161814tbl v27.16b,{v27.16b},v6.16b1815ror w19,w19,#161816tbl v31.16b,{v31.16b},v6.16b1817ror w20,w20,#161818add 
v10.4s,v10.4s,v11.4s1819add w15,w15,w211820add v14.4s,v14.4s,v15.4s1821add w16,w16,w171822add v18.4s,v18.4s,v19.4s1823add w13,w13,w191824add v22.4s,v22.4s,v23.4s1825add w14,w14,w201826add v26.4s,v26.4s,v27.4s1827eor w10,w10,w151828add v30.4s,v30.4s,v31.4s1829eor w11,w11,w161830eor v0.16b,v9.16b,v10.16b1831eor w12,w12,w131832eor v1.16b,v13.16b,v14.16b1833eor w9,w9,w141834eor v2.16b,v17.16b,v18.16b1835ror w10,w10,#201836eor v3.16b,v21.16b,v22.16b1837ror w11,w11,#201838eor v4.16b,v25.16b,v26.16b1839ror w12,w12,#201840eor v5.16b,v29.16b,v30.16b1841ror w9,w9,#201842ushr v9.4s,v0.4s,#251843add w5,w5,w101844ushr v13.4s,v1.4s,#251845add w6,w6,w111846ushr v17.4s,v2.4s,#251847add w7,w7,w121848ushr v21.4s,v3.4s,#251849add w8,w8,w91850ushr v25.4s,v4.4s,#251851eor w21,w21,w51852ushr v29.4s,v5.4s,#251853eor w17,w17,w61854sli v9.4s,v0.4s,#71855eor w19,w19,w71856sli v13.4s,v1.4s,#71857eor w20,w20,w81858sli v17.4s,v2.4s,#71859ror w21,w21,#241860sli v21.4s,v3.4s,#71861ror w17,w17,#241862sli v25.4s,v4.4s,#71863ror w19,w19,#241864sli v29.4s,v5.4s,#71865ror w20,w20,#241866ext v10.16b,v10.16b,v10.16b,#81867add w15,w15,w211868ext v14.16b,v14.16b,v14.16b,#81869add w16,w16,w171870ext v18.16b,v18.16b,v18.16b,#81871add w13,w13,w191872ext v22.16b,v22.16b,v22.16b,#81873add w14,w14,w201874ext v26.16b,v26.16b,v26.16b,#81875eor w10,w10,w151876ext v30.16b,v30.16b,v30.16b,#81877eor w11,w11,w161878ext v11.16b,v11.16b,v11.16b,#41879eor w12,w12,w131880ext v15.16b,v15.16b,v15.16b,#41881eor w9,w9,w141882ext v19.16b,v19.16b,v19.16b,#41883ror w10,w10,#251884ext v23.16b,v23.16b,v23.16b,#41885ror w11,w11,#251886ext v27.16b,v27.16b,v27.16b,#41887ror w12,w12,#251888ext v31.16b,v31.16b,v31.16b,#41889ror w9,w9,#251890ext v9.16b,v9.16b,v9.16b,#121891ext v13.16b,v13.16b,v13.16b,#121892ext v17.16b,v17.16b,v17.16b,#121893ext v21.16b,v21.16b,v21.16b,#121894ext v25.16b,v25.16b,v25.16b,#121895ext v29.16b,v29.16b,v29.16b,#121896cbnz x4,.Loop_lower_neon18971898add w5,w5,w22 // accumulate key block1899ldp 
q0,q1,[sp,#0]1900add x6,x6,x22,lsr#321901ldp q2,q3,[sp,#32]1902add w7,w7,w231903ldp q4,q5,[sp,#64]1904add x8,x8,x23,lsr#321905ldr q6,[sp,#96]1906add v8.4s,v8.4s,v0.4s1907add w9,w9,w241908add v12.4s,v12.4s,v0.4s1909add x10,x10,x24,lsr#321910add v16.4s,v16.4s,v0.4s1911add w11,w11,w251912add v20.4s,v20.4s,v0.4s1913add x12,x12,x25,lsr#321914add v24.4s,v24.4s,v0.4s1915add w13,w13,w261916add v28.4s,v28.4s,v0.4s1917add x14,x14,x26,lsr#321918add v10.4s,v10.4s,v2.4s1919add w15,w15,w271920add v14.4s,v14.4s,v2.4s1921add x16,x16,x27,lsr#321922add v18.4s,v18.4s,v2.4s1923add w17,w17,w281924add v22.4s,v22.4s,v2.4s1925add x19,x19,x28,lsr#321926add v26.4s,v26.4s,v2.4s1927add w20,w20,w301928add v30.4s,v30.4s,v2.4s1929add x21,x21,x30,lsr#321930add v27.4s,v27.4s,v7.4s // +41931add x5,x5,x6,lsl#32 // pack1932add v31.4s,v31.4s,v7.4s // +41933add x7,x7,x8,lsl#321934add v11.4s,v11.4s,v3.4s1935ldp x6,x8,[x1,#0] // load input1936add v15.4s,v15.4s,v4.4s1937add x9,x9,x10,lsl#321938add v19.4s,v19.4s,v5.4s1939add x11,x11,x12,lsl#321940add v23.4s,v23.4s,v6.4s1941ldp x10,x12,[x1,#16]1942add v27.4s,v27.4s,v3.4s1943add x13,x13,x14,lsl#321944add v31.4s,v31.4s,v4.4s1945add x15,x15,x16,lsl#321946add v9.4s,v9.4s,v1.4s1947ldp x14,x16,[x1,#32]1948add v13.4s,v13.4s,v1.4s1949add x17,x17,x19,lsl#321950add v17.4s,v17.4s,v1.4s1951add x20,x20,x21,lsl#321952add v21.4s,v21.4s,v1.4s1953ldp x19,x21,[x1,#48]1954add v25.4s,v25.4s,v1.4s1955add x1,x1,#641956add v29.4s,v29.4s,v1.4s19571958#ifdef __AARCH64EB__1959rev x5,x51960rev x7,x71961rev x9,x91962rev x11,x111963rev x13,x131964rev x15,x151965rev x17,x171966rev x20,x201967#endif1968ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#641969eor x5,x5,x61970eor x7,x7,x81971eor x9,x9,x101972eor x11,x11,x121973eor x13,x13,x141974eor v8.16b,v8.16b,v0.16b1975eor x15,x15,x161976eor v9.16b,v9.16b,v1.16b1977eor x17,x17,x191978eor v10.16b,v10.16b,v2.16b1979eor x20,x20,x211980eor v11.16b,v11.16b,v3.16b1981ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#6419821983stp x5,x7,[x0,#0] // store 
output1984add x28,x28,#7 // increment counter1985stp x9,x11,[x0,#16]1986stp x13,x15,[x0,#32]1987stp x17,x20,[x0,#48]1988add x0,x0,#641989st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#6419901991ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#641992eor v12.16b,v12.16b,v0.16b1993eor v13.16b,v13.16b,v1.16b1994eor v14.16b,v14.16b,v2.16b1995eor v15.16b,v15.16b,v3.16b1996st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#6419971998ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#641999eor v16.16b,v16.16b,v8.16b2000ldp q0,q1,[sp,#0]2001eor v17.16b,v17.16b,v9.16b2002ldp q2,q3,[sp,#32]2003eor v18.16b,v18.16b,v10.16b2004eor v19.16b,v19.16b,v11.16b2005st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#6420062007ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#642008eor v20.16b,v20.16b,v12.16b2009eor v21.16b,v21.16b,v13.16b2010eor v22.16b,v22.16b,v14.16b2011eor v23.16b,v23.16b,v15.16b2012st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#6420132014ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#642015eor v24.16b,v24.16b,v16.16b2016eor v25.16b,v25.16b,v17.16b2017eor v26.16b,v26.16b,v18.16b2018eor v27.16b,v27.16b,v19.16b2019st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#6420202021shl v8.4s,v7.4s,#1 // 4 -> 82022eor v28.16b,v28.16b,v20.16b2023eor v29.16b,v29.16b,v21.16b2024eor v30.16b,v30.16b,v22.16b2025eor v31.16b,v31.16b,v23.16b2026st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#6420272028add v3.4s,v3.4s,v8.4s // += 82029add v4.4s,v4.4s,v8.4s2030add v5.4s,v5.4s,v8.4s2031add v6.4s,v6.4s,v8.4s20322033b.hs .Loop_outer_512_neon20342035adds x2,x2,#5122036ushr v7.4s,v7.4s,#1 // 4 -> 220372038ldp d10,d11,[sp,#128+16] // meet ABI requirements2039ldp d12,d13,[sp,#128+32]2040ldp d14,d15,[sp,#128+48]20412042stp q0,q0,[sp,#0] // wipe off-load area2043stp q0,q0,[sp,#32]2044stp q0,q0,[sp,#64]20452046b.eq .Ldone_512_neon20472048sub x3,x3,#16 // .Lone2049cmp x2,#1922050add sp,sp,#1282051sub v3.4s,v3.4s,v7.4s // -= 22052ld1 {v8.4s,v9.4s},[x3]2053b.hs .Loop_outer_neon20542055ldp d8,d9,[sp,#0] // meet ABI requirements2056eor 
v1.16b,v1.16b,v1.16b2057eor v2.16b,v2.16b,v2.16b2058eor v3.16b,v3.16b,v3.16b2059eor v4.16b,v4.16b,v4.16b2060eor v5.16b,v5.16b,v5.16b2061eor v6.16b,v6.16b,v6.16b2062b .Loop_outer20632064.Ldone_512_neon:2065ldp d8,d9,[sp,#128+0] // meet ABI requirements2066ldp x19,x20,[x29,#16]2067add sp,sp,#128+642068ldp x21,x22,[x29,#32]2069ldp x23,x24,[x29,#48]2070ldp x25,x26,[x29,#64]2071ldp x27,x28,[x29,#80]2072ldp x29,x30,[sp],#962073AARCH64_VALIDATE_LINK_REGISTER2074ret2075.size ChaCha20_512_neon,.-ChaCha20_512_neon207620772078