/* Path: blob/main/sys/crypto/openssl/arm/poly1305-armv4.S (39482 views) */
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

@ Poly1305 MAC, ARMv4 integer code with optional NEON path.
@
@ Context layout (byte offsets from the ctx pointer in r0), as used below:
@   #0..#16   h0..h4            -- 130-bit accumulator (five 32-bit words)
@   #20..#32  r0..r3            -- clamped key
@   #36       is_base2_26 flag  -- radix the stored accumulator is in
@   #48...    NEON power table  -- r^1..r^4 in base 2^26, filled by
@                                  poly1305_init_neon

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
@ poly1305_init(ctx=r0, key=r1, func[2]=r2)
@ Zeroes the accumulator, clamps and stores the 16-byte key.  Returns 0 in
@ r0 if key==NULL.  When built with __ARM_MAX_ARCH__>=7, also fills the
@ two-entry function table at r2 with the blocks/emit entry points (NEON
@ variants if OPENSSL_armcap_P reports ARMV7_NEON) and returns 1.
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	@ Key setup interleaves byte-wise little-endian loads of the key with
	@ the capability probe below; the key limbs are clamped per Poly1305:
	@ r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if	!defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if	defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1		@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
@ poly1305_blocks(ctx=r0, inp=r1, len=r2, padbit=r3)
@ Scalar (non-NEON) block loop: absorbs len&~15 bytes in 16-byte blocks,
@ h = (h + block + padbit*2^128) * r mod 2^130-5, all in base 2^32.
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	@ r10/r11/r12 become 5*r1/5*r2/5*r3 (r + r>>2) for the modular
	@ wrap-around terms of the 2^130-5 multiplication below.
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	@ partial reduction: fold h4's bits >=2 back as *5 into h0
	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
@ poly1305_emit(ctx=r0, mac=r1, nonce=r2)
@ Final reduction modulo 2^130-5 (constant-time conditional subtraction),
@ adds the 128-bit nonce and stores the 16-byte tag little-endian.
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	@ pre-ARMv7: mac pointer may be unaligned, store byte by byte
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
@ poly1305_init_neon(ctx=r0)
@ Converts the stored key r to base 2^26 and computes r^2, r^3, r^4 by
@ repeated squaring, storing the interleaved power table at ctx+48.
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of n-bit number by m-bit number is
	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
	@ m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that result of reduction have to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize amount of instructions [as well as amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that result of right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows to use paddd in place for paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000		@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30		@ h4 -> h0
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10			@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr			@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
@ poly1305_blocks_neon(ctx=r0, inp=r1, len=r2, padbit=r3)
@ Two-way NEON block loop in base 2^26; falls back to the scalar
@ poly1305_blocks for short input when the accumulator is still base 2^32.
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	@ odd 16-byte block: absorb one block scalar-lane-wise first
	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@ ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@ ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26	@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr			@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
@ poly1305_emit_neon(ctx=r0, mac=r1, nonce=r2)
@ If the accumulator is still base 2^32, tail-calls the scalar emit path;
@ otherwise converts base 2^26 -> 2^32 with partial reduction, then does
@ the final conditional subtraction and nonce addition as in poly1305_emit.
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r6,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr			@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7

.hidden	OPENSSL_armcap_P
#endif