Path: blob/main/sys/crypto/openssl/aarch64/poly1305-armv8.S
39536 views
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

// Poly1305 for AArch64, GNU as syntax, run through the C preprocessor.
//
// Context layout as used by the code in this file (offsets from x0):
//   +0  .. +23  hash value: three 64-bit limbs (base 2^64), or five 32-bit
//               limbs at +0..+19 (base 2^26) once the NEON path has run
//   +24         is_base2_26 flag (64-bit; non-zero means base 2^26)
//   +32 .. +47  clamped key r0,r1
//   +48 .. +191 NEON table of key powers: nine 16-byte rows in the order
//               r0,r1,s1,r2,s2,r3,s3,r4,s4 (sN = 5*rN); 32-bit lane 0 holds
//               r^4, lane 1 r^3, lane 2 r^2 and lane 3 r^1

.text

// forward "declarations" are required for Apple

.hidden OPENSSL_armcap_P
.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

//----------------------------------------------------------------------
// poly1305_init
// In:    x0 = context, x1 = key pointer (may be NULL),
//        x2 = where to store the two selected function pointers
// Out:   x0 = 0 if x1 is NULL, 1 otherwise
// The first 32 bytes of the context are zeroed in both cases. With a key,
// the clamped r0,r1 are saved at [x0,#32] and [x2] receives the addresses
// of the blocks/emit routines: the scalar pair when the ARMV7_NEON bit of
// OPENSSL_armcap_P is clear, the NEON pair otherwise.
// Clobb: x7-x9, x12, x13, x17, flags
//----------------------------------------------------------------------
.type poly1305_init,%function
.align 5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp x1,xzr
	stp xzr,xzr,[x0] // zero hash value
	stp xzr,xzr,[x0,#16] // [along with is_base2_26]

	csel x0,xzr,x0,eq // return value 0 when no key was passed
	b.eq .Lno_key

	adrp x17,OPENSSL_armcap_P
	ldr w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp x7,x8,[x1] // load key
	mov x9,#0xfffffffc0fffffff
	movk x9,#0x0fff,lsl#48
#ifdef __AARCH64EB__
	rev x7,x7 // flip bytes
	rev x8,x8
#endif
	and x7,x7,x9 // &=0ffffffc0fffffff
	and x9,x9,#-4
	and x8,x8,x9 // &=0ffffffc0ffffffc
	stp x7,x8,[x0,#32] // save key value

	tst w17,#ARMV7_NEON // flags stay live until the csel pair below

	// The adrp operand names the global alias and the :lo12: operand the
	// local alias; both labels are defined at the same address.
	adrp x12,poly1305_blocks
	add x12,x12,#:lo12:.Lpoly1305_blocks
	adrp x7,poly1305_blocks_neon
	add x7,x7,#:lo12:.Lpoly1305_blocks_neon
	adrp x13,poly1305_emit
	add x13,x13,#:lo12:.Lpoly1305_emit
	adrp x8,poly1305_emit_neon
	add x8,x8,#:lo12:.Lpoly1305_emit_neon

	csel x12,x12,x7,eq // eq: NEON bit clear, keep scalar routines
	csel x13,x13,x8,eq

#ifdef __ILP32__
	stp w12,w13,[x2]
#else
	stp x12,x13,[x2]
#endif

	mov x0,#1
.Lno_key:
	ret
.size poly1305_init,.-poly1305_init

//----------------------------------------------------------------------
// poly1305_blocks (scalar, base 2^64)
// In:    x0 = context, x1 = input, x2 = length in bytes,
//        x3 = value added into the top limb h2 with every block
//             (presumably the padding bit supplied by the caller)
// The length is rounded down to a multiple of 16; nothing is done when the
// result is zero. For each 16-byte block: h = (h + block + x3*2^128) * r,
// partially reduced, then h is written back to [x0].
// Clobb: x1, x2, x4-x14, flags
//----------------------------------------------------------------------
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands x2,x2,#-16
	b.eq .Lno_data

	ldp x4,x5,[x0] // load hash value
	ldp x7,x8,[x0,#32] // load key value
	ldr x6,[x0,#16]
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
	b .Loop

.align 5
.Loop:
	ldp x10,x11,[x1],#16 // load input
	sub x2,x2,#16
#ifdef __AARCH64EB__
	rev x10,x10
	rev x11,x11
#endif
	adds x4,x4,x10 // accumulate input
	adcs x5,x5,x11

	mul x12,x4,x7 // h0*r0
	adc x6,x6,x3
	umulh x13,x4,x7

	mul x10,x5,x9 // h1*5*r1
	umulh x11,x5,x9

	adds x12,x12,x10
	mul x10,x4,x8 // h0*r1
	adc x13,x13,x11
	umulh x14,x4,x8

	adds x13,x13,x10
	mul x10,x5,x7 // h1*r0
	adc x14,x14,xzr
	umulh x11,x5,x7

	adds x13,x13,x10
	mul x10,x6,x9 // h2*5*r1
	adc x14,x14,x11
	mul x11,x6,x7 // h2*r0

	adds x13,x13,x10
	adc x14,x14,x11

	// Fold everything above bit 130 back in multiplied by 5:
	// (x14 & -4) + (x14 >> 2) equals 5 * (x14 >> 2).
	and x10,x14,#-4 // final reduction
	and x6,x14,#3
	add x10,x10,x14,lsr#2
	adds x4,x12,x10
	adcs x5,x13,xzr
	adc x6,x6,xzr

	cbnz x2,.Loop

	stp x4,x5,[x0] // store hash value
	str x6,[x0,#16]

.Lno_data:
	ret
.size poly1305_blocks,.-poly1305_blocks

//----------------------------------------------------------------------
// poly1305_emit (scalar, base 2^64)
// In:    x0 = context, x1 = 16-byte output, x2 = nonce (two 64-bit loads)
// Computes h+5 and keeps it instead of h when the addition carries into
// bit 2 or above of the top limb, then adds the nonce modulo 2^128 and
// stores the 16-byte result at [x1].
// Clobb: x4-x6, x10-x14, flags
//----------------------------------------------------------------------
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp x4,x5,[x0] // load hash base 2^64
	ldr x6,[x0,#16]
	ldp x10,x11,[x2] // load nonce

	adds x12,x4,#5 // compare to modulus
	adcs x13,x5,xzr
	adc x14,x6,xzr

	tst x14,#-4 // see if it's carried/borrowed

	csel x4,x4,x12,eq
	csel x5,x5,x13,eq

#ifdef __AARCH64EB__
	ror x10,x10,#32 // flip nonce words
	ror x11,x11,#32
#endif
	adds x4,x4,x10 // accumulate nonce
	adc x5,x5,x11
#ifdef __AARCH64EB__
	rev x4,x4 // flip output bytes
	rev x5,x5
#endif
	stp x4,x5,[x1] // write result

	ret
.size poly1305_emit,.-poly1305_emit
//----------------------------------------------------------------------
// poly1305_mult (internal helper, private register convention)
// In:    x4,x5,x6 = h0,h1,h2; x7,x8 = r0,r1; x9 = s1 = r1 + (r1 >> 2)
// Out:   x4,x5,x6 = h*r, partially reduced (same sequence as the body of
//        .Loop in poly1305_blocks, without the input accumulation)
// Clobb: x10-x14, flags
// Has no AARCH64_VALID_CALL_TARGET marker; every reference visible in
// this file is a direct bl.
//----------------------------------------------------------------------
.type poly1305_mult,%function
.align 5
poly1305_mult:
	mul x12,x4,x7 // h0*r0
	umulh x13,x4,x7

	mul x10,x5,x9 // h1*5*r1
	umulh x11,x5,x9

	adds x12,x12,x10
	mul x10,x4,x8 // h0*r1
	adc x13,x13,x11
	umulh x14,x4,x8

	adds x13,x13,x10
	mul x10,x5,x7 // h1*r0
	adc x14,x14,xzr
	umulh x11,x5,x7

	adds x13,x13,x10
	mul x10,x6,x9 // h2*5*r1
	adc x14,x14,x11
	mul x11,x6,x7 // h2*r0

	adds x13,x13,x10
	adc x14,x14,x11

	and x10,x14,#-4 // final reduction
	and x6,x14,#3
	add x10,x10,x14,lsr#2
	adds x4,x12,x10
	adcs x5,x13,xzr
	adc x6,x6,xzr

	ret
.size poly1305_mult,.-poly1305_mult

//----------------------------------------------------------------------
// poly1305_splat (internal helper, private register convention)
// In:    x4,x5,x6 = value in base 2^64; x0 = destination
// Splits the value into five 26-bit limbs r0..r4 and stores nine 32-bit
// words at a 16-byte stride from [x0]: r0,r1,s1,r2,s2,r3,s3,r4,s4 with
// sN = 5*rN. x4-x6 and x0 are left unchanged.
// Clobb: x12-x16
//----------------------------------------------------------------------
.type poly1305_splat,%function
.align 5
poly1305_splat:
	and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
	ubfx x13,x4,#26,#26
	extr x14,x5,x4,#52
	and x14,x14,#0x03ffffff
	ubfx x15,x5,#14,#26
	extr x16,x6,x5,#40

	str w12,[x0,#16*0] // r0
	add w12,w13,w13,lsl#2 // r1*5
	str w13,[x0,#16*1] // r1
	add w13,w14,w14,lsl#2 // r2*5
	str w12,[x0,#16*2] // s1
	str w14,[x0,#16*3] // r2
	add w14,w15,w15,lsl#2 // r3*5
	str w13,[x0,#16*4] // s2
	str w15,[x0,#16*5] // r3
	add w15,w16,w16,lsl#2 // r4*5
	str w14,[x0,#16*6] // s3
	str w16,[x0,#16*7] // r4
	str w15,[x0,#16*8] // s4

	ret
.size poly1305_splat,.-poly1305_splat

//----------------------------------------------------------------------
// poly1305_blocks_neon
// In:    x0 = context, x1 = input, x2 = length in bytes,
//        x3 = value added into the top limb with every block (see
//             poly1305_blocks)
// Dispatch: when the length is below 128 and is_base2_26 is zero, control
// branches to the scalar .Lpoly1305_blocks and no frame is created.
// Otherwise an 80-byte frame is pushed: x29,x30 at [sp], d8-d15 at
// [sp,#16]..[sp,#79]. d8-d15 are saved only on the paths that go on to use
// them (.Linit_neon and .Leven_neon) and are reloaded in .Lshort_tail.
// x30 is reloaded from [sp,#8] after the bl calls, so the epilogue pops
// x29 only.
// Clobb: x1-x17, v0-v7, v16-v31, flags
//----------------------------------------------------------------------
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr x17,[x0,#24] // is_base2_26
	cmp x2,#128
	b.hs .Lblocks_neon
	cbz x17,.Lpoly1305_blocks

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0

	ands x2,x2,#-16
	b.eq .Lno_data_neon

	cbz x17,.Lbase2_64_neon

	ldp w10,w11,[x0] // load hash value base 2^26
	ldp w12,w13,[x0,#8]
	ldr w14,[x0,#16]

	tst x2,#31 // even number of blocks: go straight to NEON
	b.eq .Leven_neon

	// Odd number of blocks: process one block with the scalar multiply.
	ldp x7,x8,[x0,#32] // load key value

	add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
	lsr x5,x12,#12
	adds x4,x4,x12,lsl#52
	add x5,x5,x13,lsl#14
	adc x5,x5,xzr
	lsr x6,x14,#24
	adds x5,x5,x14,lsl#40
	adc x14,x6,xzr // can be partially reduced...

	ldp x12,x13,[x1],#16 // load input
	sub x2,x2,#16
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)

	and x10,x14,#-4 // ... so reduce
	and x6,x14,#3
	add x10,x10,x14,lsr#2
	adds x4,x4,x10
	adcs x5,x5,xzr
	adc x6,x6,xzr

#ifdef __AARCH64EB__
	rev x12,x12
	rev x13,x13
#endif
	adds x4,x4,x12 // accumulate input
	adcs x5,x5,x13
	adc x6,x6,x3

	bl poly1305_mult
	ldr x30,[sp,#8] // bl overwrote x30; reload the saved copy

	cbz x3,.Lstore_base2_64_neon

	and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
	ubfx x11,x4,#26,#26
	extr x12,x5,x4,#52
	and x12,x12,#0x03ffffff
	ubfx x13,x5,#14,#26
	extr x14,x6,x5,#40

	cbnz x2,.Leven_neon

	stp w10,w11,[x0] // store hash value base 2^26
	stp w12,w13,[x0,#8]
	str w14,[x0,#16]
	b .Lno_data_neon

.align 4
.Lstore_base2_64_neon:
	stp x4,x5,[x0] // store hash value base 2^64
	stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
	b .Lno_data_neon

.align 4
.Lbase2_64_neon:
	ldp x7,x8,[x0,#32] // load key value

	ldp x4,x5,[x0] // load hash value base 2^64
	ldr x6,[x0,#16]

	tst x2,#31
	b.eq .Linit_neon

	ldp x12,x13,[x1],#16 // load input
	sub x2,x2,#16
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
	rev x12,x12
	rev x13,x13
#endif
	adds x4,x4,x12 // accumulate input
	adcs x5,x5,x13
	adc x6,x6,x3

	bl poly1305_mult

.Linit_neon:
	and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
	ubfx x11,x4,#26,#26
	extr x12,x5,x4,#52
	and x12,x12,#0x03ffffff
	ubfx x13,x5,#14,#26
	extr x14,x6,x5,#40

	stp d8,d9,[sp,#16] // meet ABI requirements
	stp d10,d11,[sp,#32]
	stp d12,d13,[sp,#48]
	stp d14,d15,[sp,#64]

	fmov d24,x10
	fmov d25,x11
	fmov d26,x12
	fmov d27,x13
	fmov d28,x14

	////////////////////////////////// initialize r^n table
	// x0 starts at table+12 (32-bit lane 3) and steps down by 4 per
	// power, so r^1 lands in lane 3 and r^4 in lane 0 of every row.
	mov x4,x7 // r^1
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
	mov x5,x8
	mov x6,xzr
	add x0,x0,#48+12
	bl poly1305_splat

	bl poly1305_mult // r^2
	sub x0,x0,#4
	bl poly1305_splat

	bl poly1305_mult // r^3
	sub x0,x0,#4
	bl poly1305_splat

	bl poly1305_mult // r^4
	sub x0,x0,#4
	bl poly1305_splat
	ldr x30,[sp,#8] // bl overwrote x30; reload the saved copy

	// x16 = source of inp[2:3]: inp+32, or .Lzeros when fewer than 64
	// bytes remain. The flags from this subs feed b.ls .Lskip_loop.
	add x16,x1,#32
	adrp x17,.Lzeros
	add x17,x17,#:lo12:.Lzeros
	subs x2,x2,#64
	csel x16,x17,x16,lo

	mov x4,#1
	stur x4,[x0,#-24] // set is_base2_26
	sub x0,x0,#48 // restore original x0
	b .Ldo_neon

.align 4
.Leven_neon:
	add x16,x1,#32
	adrp x17,.Lzeros
	add x17,x17,#:lo12:.Lzeros
	subs x2,x2,#64
	csel x16,x17,x16,lo

	stp d8,d9,[sp,#16] // meet ABI requirements
	stp d10,d11,[sp,#32]
	stp d12,d13,[sp,#48]
	stp d14,d15,[sp,#64]

	fmov d24,x10
	fmov d25,x11
	fmov d26,x12
	fmov d27,x13
	fmov d28,x14

.Ldo_neon:
	ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
	ldp x9,x13,[x16],#48

	lsl x3,x3,#24 // move x3 to bit 24, where it is added to the top 26-bit limb
	add x15,x0,#48 // x15 = table of key powers

#ifdef __AARCH64EB__
	rev x8,x8
	rev x12,x12
	rev x9,x9
	rev x13,x13
#endif
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	and x5,x9,#0x03ffffff
	ubfx x6,x8,#26,#26
	ubfx x7,x9,#26,#26
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	extr x8,x12,x8,#52
	extr x9,x13,x9,#52
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	fmov d14,x4
	and x8,x8,#0x03ffffff
	and x9,x9,#0x03ffffff
	ubfx x10,x12,#14,#26
	ubfx x11,x13,#14,#26
	add x12,x3,x12,lsr#40
	add x13,x3,x13,lsr#40
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	fmov d15,x6
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	fmov d16,x8
	fmov d17,x10
	fmov d18,x12

	ldp x8,x12,[x1],#16 // inp[0:1]
	ldp x9,x13,[x1],#48

	// Load the nine table rows: v0=r0 v1=r1 v2=s1 v3=r2 v4=s2 v5=r3
	// v6=s3 v7=r4 v8=s4.
	ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1 {v8.4s},[x15]

#ifdef __AARCH64EB__
	rev x8,x8
	rev x12,x12
	rev x9,x9
	rev x13,x13
#endif
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	and x5,x9,#0x03ffffff
	ubfx x6,x8,#26,#26
	ubfx x7,x9,#26,#26
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	extr x8,x12,x8,#52
	extr x9,x13,x9,#52
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	fmov d9,x4
	and x8,x8,#0x03ffffff
	and x9,x9,#0x03ffffff
	ubfx x10,x12,#14,#26
	ubfx x11,x13,#14,#26
	add x12,x3,x12,lsr#40
	add x13,x3,x13,lsr#40
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	fmov d10,x6
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	movi v31.2d,#-1
	fmov d11,x8
	fmov d12,x10
	fmov d13,x12
	ushr v31.2d,v31.2d,#38 // v31 = 0x03ffffff in each 64-bit lane

	b.ls .Lskip_loop // flags still from the subs x2,x2,#64 above

.align 4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	// The flags set by this subs are consumed by the csel below and by
	// b.hi at the bottom of the loop; nothing in between writes flags.
	subs x2,x2,#64
	umull v23.2d,v14.2s,v7.s[2]
	csel x16,x17,x16,lo
	umull v22.2d,v14.2s,v5.s[2]
	umull v21.2d,v14.2s,v3.s[2]
	ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
	umull v20.2d,v14.2s,v1.s[2]
	ldp x9,x13,[x16],#48
	umull v19.2d,v14.2s,v0.s[2]
#ifdef __AARCH64EB__
	rev x8,x8
	rev x12,x12
	rev x9,x9
	rev x13,x13
#endif

	umlal v23.2d,v15.2s,v5.s[2]
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	umlal v22.2d,v15.2s,v3.s[2]
	and x5,x9,#0x03ffffff
	umlal v21.2d,v15.2s,v1.s[2]
	ubfx x6,x8,#26,#26
	umlal v20.2d,v15.2s,v0.s[2]
	ubfx x7,x9,#26,#26
	umlal v19.2d,v15.2s,v8.s[2]
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32

	umlal v23.2d,v16.2s,v3.s[2]
	extr x8,x12,x8,#52
	umlal v22.2d,v16.2s,v1.s[2]
	extr x9,x13,x9,#52
	umlal v21.2d,v16.2s,v0.s[2]
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	umlal v20.2d,v16.2s,v8.s[2]
	fmov d14,x4
	umlal v19.2d,v16.2s,v6.s[2]
	and x8,x8,#0x03ffffff

	umlal v23.2d,v17.2s,v1.s[2]
	and x9,x9,#0x03ffffff
	umlal v22.2d,v17.2s,v0.s[2]
	ubfx x10,x12,#14,#26
	umlal v21.2d,v17.2s,v8.s[2]
	ubfx x11,x13,#14,#26
	umlal v20.2d,v17.2s,v6.s[2]
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	umlal v19.2d,v17.2s,v4.s[2]
	fmov d15,x6

	add v11.2s,v11.2s,v26.2s
	add x12,x3,x12,lsr#40
	umlal v23.2d,v18.2s,v0.s[2]
	add x13,x3,x13,lsr#40
	umlal v22.2d,v18.2s,v8.s[2]
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	umlal v21.2d,v18.2s,v6.s[2]
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	umlal v20.2d,v18.2s,v4.s[2]
	fmov d16,x8
	umlal v19.2d,v18.2s,v2.s[2]
	fmov d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add v9.2s,v9.2s,v24.2s
	fmov d18,x12
	umlal v22.2d,v11.2s,v1.s[0]
	ldp x8,x12,[x1],#16 // inp[0:1]
	umlal v19.2d,v11.2s,v6.s[0]
	ldp x9,x13,[x1],#48
	umlal v23.2d,v11.2s,v3.s[0]
	umlal v20.2d,v11.2s,v8.s[0]
	umlal v21.2d,v11.2s,v0.s[0]
#ifdef __AARCH64EB__
	rev x8,x8
	rev x12,x12
	rev x9,x9
	rev x13,x13
#endif

	add v10.2s,v10.2s,v25.2s
	umlal v22.2d,v9.2s,v5.s[0]
	umlal v23.2d,v9.2s,v7.s[0]
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	umlal v21.2d,v9.2s,v3.s[0]
	and x5,x9,#0x03ffffff
	umlal v19.2d,v9.2s,v0.s[0]
	ubfx x6,x8,#26,#26
	umlal v20.2d,v9.2s,v1.s[0]
	ubfx x7,x9,#26,#26

	add v12.2s,v12.2s,v27.2s
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	umlal v22.2d,v10.2s,v3.s[0]
	extr x8,x12,x8,#52
	umlal v23.2d,v10.2s,v5.s[0]
	extr x9,x13,x9,#52
	umlal v19.2d,v10.2s,v8.s[0]
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	umlal v21.2d,v10.2s,v1.s[0]
	fmov d9,x4
	umlal v20.2d,v10.2s,v0.s[0]
	and x8,x8,#0x03ffffff

	add v13.2s,v13.2s,v28.2s
	and x9,x9,#0x03ffffff
	umlal v22.2d,v12.2s,v0.s[0]
	ubfx x10,x12,#14,#26
	umlal v19.2d,v12.2s,v4.s[0]
	ubfx x11,x13,#14,#26
	umlal v23.2d,v12.2s,v1.s[0]
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	umlal v20.2d,v12.2s,v6.s[0]
	fmov d10,x6
	umlal v21.2d,v12.2s,v8.s[0]
	add x12,x3,x12,lsr#40

	umlal v22.2d,v13.2s,v8.s[0]
	add x13,x3,x13,lsr#40
	umlal v19.2d,v13.2s,v2.s[0]
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	umlal v23.2d,v13.2s,v0.s[0]
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	umlal v20.2d,v13.2s,v4.s[0]
	fmov d11,x8
	umlal v21.2d,v13.2s,v6.s[0]
	fmov d12,x10
	fmov d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr v29.2d,v22.2d,#26
	xtn v27.2s,v22.2d
	ushr v30.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b
	add v23.2d,v23.2d,v29.2d // h3 -> h4
	bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
	add v20.2d,v20.2d,v30.2d // h0 -> h1

	ushr v29.2d,v23.2d,#26
	xtn v28.2s,v23.2d
	ushr v30.2d,v20.2d,#26
	xtn v25.2s,v20.2d
	bic v28.2s,#0xfc,lsl#24
	add v21.2d,v21.2d,v30.2d // h1 -> h2

	add v19.2d,v19.2d,v29.2d
	shl v29.2d,v29.2d,#2
	shrn v30.2s,v21.2d,#26
	xtn v26.2s,v21.2d
	add v19.2d,v19.2d,v29.2d // h4 -> h0
	bic v25.2s,#0xfc,lsl#24
	add v27.2s,v27.2s,v30.2s // h2 -> h3
	bic v26.2s,#0xfc,lsl#24

	shrn v29.2s,v19.2d,#26
	xtn v24.2s,v19.2d
	ushr v30.2s,v27.2s,#26
	bic v27.2s,#0xfc,lsl#24
	bic v24.2s,#0xfc,lsl#24
	add v25.2s,v25.2s,v29.2s // h0 -> h1
	add v28.2s,v28.2s,v30.2s // h3 -> h4

	b.hi .Loop_neon

.Lskip_loop:
	dup v16.2d,v16.d[0]
	add v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	// The flags from this adds are used twice: b.ne just below and
	// b.eq .Lshort_tail after the multiply block (no flag writes between).
	adds x2,x2,#32
	b.ne .Long_tail

	dup v16.2d,v11.d[0]
	add v14.2s,v9.2s,v24.2s
	add v17.2s,v12.2s,v27.2s
	add v15.2s,v10.2s,v25.2s
	add v18.2s,v13.2s,v28.2s

.Long_tail:
	dup v14.2d,v14.d[0]
	umull2 v19.2d,v16.4s,v6.4s
	umull2 v22.2d,v16.4s,v1.4s
	umull2 v23.2d,v16.4s,v3.4s
	umull2 v21.2d,v16.4s,v0.4s
	umull2 v20.2d,v16.4s,v8.4s

	dup v15.2d,v15.d[0]
	umlal2 v19.2d,v14.4s,v0.4s
	umlal2 v21.2d,v14.4s,v3.4s
	umlal2 v22.2d,v14.4s,v5.4s
	umlal2 v23.2d,v14.4s,v7.4s
	umlal2 v20.2d,v14.4s,v1.4s

	dup v17.2d,v17.d[0]
	umlal2 v19.2d,v15.4s,v8.4s
	umlal2 v22.2d,v15.4s,v3.4s
	umlal2 v21.2d,v15.4s,v1.4s
	umlal2 v23.2d,v15.4s,v5.4s
	umlal2 v20.2d,v15.4s,v0.4s

	dup v18.2d,v18.d[0]
	umlal2 v22.2d,v17.4s,v0.4s
	umlal2 v23.2d,v17.4s,v1.4s
	umlal2 v19.2d,v17.4s,v4.4s
	umlal2 v20.2d,v17.4s,v6.4s
	umlal2 v21.2d,v17.4s,v8.4s

	umlal2 v22.2d,v18.4s,v8.4s
	umlal2 v19.2d,v18.4s,v2.4s
	umlal2 v23.2d,v18.4s,v0.4s
	umlal2 v20.2d,v18.4s,v4.4s
	umlal2 v21.2d,v18.4s,v6.4s

	b.eq .Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add v9.2s,v9.2s,v24.2s
	umlal v22.2d,v11.2s,v1.2s
	umlal v19.2d,v11.2s,v6.2s
	umlal v23.2d,v11.2s,v3.2s
	umlal v20.2d,v11.2s,v8.2s
	umlal v21.2d,v11.2s,v0.2s

	add v10.2s,v10.2s,v25.2s
	umlal v22.2d,v9.2s,v5.2s
	umlal v19.2d,v9.2s,v0.2s
	umlal v23.2d,v9.2s,v7.2s
	umlal v20.2d,v9.2s,v1.2s
	umlal v21.2d,v9.2s,v3.2s

	add v12.2s,v12.2s,v27.2s
	umlal v22.2d,v10.2s,v3.2s
	umlal v19.2d,v10.2s,v8.2s
	umlal v23.2d,v10.2s,v5.2s
	umlal v20.2d,v10.2s,v0.2s
	umlal v21.2d,v10.2s,v1.2s

	add v13.2s,v13.2s,v28.2s
	umlal v22.2d,v12.2s,v0.2s
	umlal v19.2d,v12.2s,v4.2s
	umlal v23.2d,v12.2s,v1.2s
	umlal v20.2d,v12.2s,v6.2s
	umlal v21.2d,v12.2s,v8.2s

	umlal v22.2d,v13.2s,v8.2s
	umlal v19.2d,v13.2s,v2.2s
	umlal v23.2d,v13.2s,v0.2s
	umlal v20.2d,v13.2s,v4.2s
	umlal v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp v22.2d,v22.2d,v22.2d
	ldp d8,d9,[sp,#16] // meet ABI requirements
	addp v19.2d,v19.2d,v19.2d
	ldp d10,d11,[sp,#32]
	addp v23.2d,v23.2d,v23.2d
	ldp d12,d13,[sp,#48]
	addp v20.2d,v20.2d,v20.2d
	ldp d14,d15,[sp,#64]
	addp v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr v29.2d,v22.2d,#26
	and v22.16b,v22.16b,v31.16b
	ushr v30.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b

	add v23.2d,v23.2d,v29.2d // h3 -> h4
	add v20.2d,v20.2d,v30.2d // h0 -> h1

	ushr v29.2d,v23.2d,#26
	and v23.16b,v23.16b,v31.16b
	ushr v30.2d,v20.2d,#26
	and v20.16b,v20.16b,v31.16b
	add v21.2d,v21.2d,v30.2d // h1 -> h2

	add v19.2d,v19.2d,v29.2d
	shl v29.2d,v29.2d,#2
	ushr v30.2d,v21.2d,#26
	and v21.16b,v21.16b,v31.16b
	add v19.2d,v19.2d,v29.2d // h4 -> h0
	add v22.2d,v22.2d,v30.2d // h2 -> h3

	ushr v29.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b
	ushr v30.2d,v22.2d,#26
	and v22.16b,v22.16b,v31.16b
	add v20.2d,v20.2d,v29.2d // h0 -> h1
	add v23.2d,v23.2d,v30.2d // h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1 {v23.s}[0],[x0]

.Lno_data_neon:
	ldr x29,[sp],#80 // x30 already holds the return address here
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon

//----------------------------------------------------------------------
// poly1305_emit_neon
// In:    x0 = context, x1 = 16-byte output, x2 = nonce (two 64-bit loads)
// Branches to poly1305_emit when is_base2_26 is zero. Otherwise converts
// the five 26-bit limbs back to base 2^64, reduces, and then performs the
// same compare-to-modulus, nonce addition and store as poly1305_emit.
// Clobb: x4-x6, x10-x14, x17, flags
//----------------------------------------------------------------------
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr x17,[x0,#24] // is_base2_26
	cbz x17,poly1305_emit

	ldp w10,w11,[x0] // load hash value base 2^26
	ldp w12,w13,[x0,#8]
	ldr w14,[x0,#16]

	add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
	lsr x5,x12,#12
	adds x4,x4,x12,lsl#52
	add x5,x5,x13,lsl#14
	adc x5,x5,xzr
	lsr x6,x14,#24
	adds x5,x5,x14,lsl#40
	adc x6,x6,xzr // can be partially reduced...

	ldp x10,x11,[x2] // load nonce

	and x12,x6,#-4 // ... so reduce
	add x12,x12,x6,lsr#2
	and x6,x6,#3
	adds x4,x4,x12
	adcs x5,x5,xzr
	adc x6,x6,xzr

	adds x12,x4,#5 // compare to modulus
	adcs x13,x5,xzr
	adc x14,x6,xzr

	tst x14,#-4 // see if it's carried/borrowed

	csel x4,x4,x12,eq
	csel x5,x5,x13,eq

#ifdef __AARCH64EB__
	ror x10,x10,#32 // flip nonce words
	ror x11,x11,#32
#endif
	adds x4,x4,x10 // accumulate nonce
	adc x5,x5,x11
#ifdef __AARCH64EB__
	rev x4,x4 // flip output bytes
	rev x5,x5
#endif
	stp x4,x5,[x1] // write result

	ret
.size poly1305_emit_neon,.-poly1305_emit_neon

.section .rodata

.align 5
// 32 zero bytes, read in place of inp[2:3] by poly1305_blocks_neon when
// fewer than 64 bytes of input remain.
.Lzeros:
.long 0,0,0,0,0,0,0,0
// NUL-terminated ASCII: Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2