// Path: blob/main/sys/crypto/openssl/aarch64/armv8-mont.S
// (mirrored page reported 39536 views)
/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#include "arm_arch.h"

#ifndef __KERNEL__
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

// bn_mul_mont(rp, ap, bp, np, &n0, num)  -- Montgomery multiplication.
//   x0=rp (result), x1=ap, x2=bp, x3=np (modulus), x4=&n0, x5=num
//   (num in 64-bit words; register roles per the inline comments below).
// Dispatches to the NEON path (num%4==0, num>32, runtime flag set), to
// __bn_sqr8x_mont / __bn_mul4x_mont for other aligned sizes, and otherwise
// runs the scalar Montgomery loop inline.  Scalar path returns 1 in x0.
.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
tst x5,#3
b.ne .Lmul_mont
cmp x5,#32
b.le .Lscalar_impl
#ifndef __KERNEL__
#ifndef __AARCH64EB__
adrp x17,OPENSSL_armv8_rsa_neonized
ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
cbnz w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont

.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]

ldr x9,[x2],#8 // bp[0]
sub x22,sp,x5,lsl#3
ldp x7,x8,[x1],#16 // ap[0..1]
lsl x5,x5,#3
ldr x4,[x4] // *n0
and x22,x22,#-16 // ABI says so
ldp x13,x14,[x3],#16 // np[0..1]

mul x6,x7,x9 // ap[0]*bp[0]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
mul x10,x8,x9 // ap[1]*bp[0]
umulh x11,x8,x9

mul x15,x6,x4 // "tp[0]"*n0
mov sp,x22 // alloca

// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) As for removal of first multiplication and addition
// instructions. The outcome of first addition is
// guaranteed to be zero, which leaves two computationally
// significant outcomes: it either carries or not. Then
// question is when does it carry? Is there alternative
// way to deduce it? If you follow operations, you can
// observe that condition for carry is quite simple:
// x6 being non-zero. So that carry can be calculated
// by adding -1 to x6. That's what next instruction does.
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
adc x13,x13,xzr
cbz x21,.L1st_skip

.L1st:
ldr x8,[x1],#8
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr

ldr x14,[x3],#8
adds x12,x16,x13
mul x10,x8,x9 // ap[j]*bp[0]
adc x13,x17,xzr
umulh x11,x8,x9

adds x12,x12,x6
mul x16,x14,x15 // np[j]*m1
adc x13,x13,xzr
umulh x17,x14,x15
str x12,[x22],#8 // tp[j-1]
cbnz x21,.L1st

.L1st_skip:
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr

adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adc x13,x17,xzr

adds x12,x12,x6
sub x20,x5,#8 // i=num-1
adcs x13,x13,x7

adc x19,xzr,xzr // upmost overflow bit
stp x12,x13,[x22]

.Louter:
ldr x9,[x2],#8 // bp[i]
ldp x7,x8,[x1],#16
ldr x23,[sp] // tp[0]
add x22,sp,#8

mul x6,x7,x9 // ap[0]*bp[i]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
ldp x13,x14,[x3],#16
mul x10,x8,x9 // ap[1]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr

mul x15,x6,x4
sub x20,x20,#8 // i--

// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
cbz x21,.Linner_skip

.Linner:
ldr x8,[x1],#8
adc x13,x13,xzr
ldr x23,[x22],#8 // tp[j]
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr

adds x12,x16,x13
ldr x14,[x3],#8
adc x13,x17,xzr

mul x10,x8,x9 // ap[j]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr

mul x16,x14,x15 // np[j]*m1
adds x12,x12,x6
umulh x17,x14,x15
stur x12,[x22,#-16] // tp[j-1]
cbnz x21,.Linner

.Linner_skip:
ldr x23,[x22],#8 // tp[j]
adc x13,x13,xzr
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr

adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adcs x13,x17,x19
adc x19,xzr,xzr

adds x6,x6,x23
adc x7,x7,xzr

adds x12,x12,x6
adcs x13,x13,x7
adc x19,x19,xzr // upmost overflow bit
stp x12,x13,[x22,#-16]

cbnz x20,.Louter

// Final step. We see if result is larger than modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract modulus, see if it borrowed,
// and conditionally copy original value.
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x14,[x3],#8 // np[0]
subs x21,x5,#8 // j=num-1 and clear borrow
mov x1,x0
.Lsub:
sbcs x8,x23,x14 // tp[j]-np[j]
ldr x23,[x22],#8
sub x21,x21,#8 // j--
ldr x14,[x3],#8
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
cbnz x21,.Lsub

sbcs x8,x23,x14
sbcs x19,x19,xzr // did it borrow?
str x8,[x1],#8 // rp[num-1]

ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x8,[x0],#8 // rp[0]
sub x5,x5,#8 // num--
nop
.Lcond_copy:
sub x5,x5,#8 // num--
csel x14,x23,x8,lo // did it borrow?
ldr x23,[x22],#8
ldr x8,[x0],#8
stur xzr,[x22,#-16] // wipe tp
stur x14,[x0,#-16]
cbnz x5,.Lcond_copy

csel x14,x23,x8,lo
stur xzr,[x22,#-8] // wipe tp
stur x14,[x0,#-8]

ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret
.size bn_mul_mont,.-bn_mul_mont
// NEON path: entered (tail-jumped) from bn_mul_mont when num is a multiple
// of 4, num > 32, and OPENSSL_armv8_rsa_neonized is non-zero.  Operates on
// 32-bit limbs (note lsl x5,x5,#1 and the w-register sub/copy loops below).
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
// only from bn_mul_mont which has already signed the return address.
stp x29,x30,[sp,#-80]!
mov x16,sp
stp d8,d9,[sp,#16]
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
lsl x5,x5,#1
eor v14.16b,v14.16b,v14.16b

.align 4
.LNEON_8n:
eor v6.16b,v6.16b,v6.16b
sub x7,sp,#128
eor v7.16b,v7.16b,v7.16b
sub x7,x7,x5,lsl#4
eor v8.16b,v8.16b,v8.16b
and x7,x7,#-64
eor v9.16b,v9.16b,v9.16b
mov sp,x7 // alloca
eor v10.16b,v10.16b,v10.16b
add x7,x7,#256
eor v11.16b,v11.16b,v11.16b
sub x8,x5,#8
eor v12.16b,v12.16b,v12.16b
eor v13.16b,v13.16b,v13.16b

.LNEON_8n_init:
st1 {v6.2d,v7.2d},[x7],#32
subs x8,x8,#8
st1 {v8.2d,v9.2d},[x7],#32
st1 {v10.2d,v11.2d},[x7],#32
st1 {v12.2d,v13.2d},[x7],#32
bne .LNEON_8n_init

add x6,sp,#256
ld1 {v0.4s,v1.4s},[x1],#32
add x10,sp,#8
ldr s30,[x4],#4
mov x9,x5
b .LNEON_8n_outer

.align 4
.LNEON_8n_outer:
ldr s28,[x2],#4 // *b++
uxtl v28.4s,v28.4h
add x7,sp,#128
ld1 {v2.4s,v3.4s},[x3],#32

umlal v6.2d,v28.2s,v0.s[0]
umlal v7.2d,v28.2s,v0.s[1]
umlal v8.2d,v28.2s,v0.s[2]
shl v29.2d,v6.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v9.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v6.2d
umlal v10.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v11.2d,v28.2s,v1.s[1]
st1 {v28.2s},[sp] // put aside smashed b[8*i+0]
umlal v12.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v13.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v6.2d,v29.2s,v2.s[0]
umlal v7.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v8.2d,v29.2s,v2.s[2]
ushr v15.2d,v6.2d,#16
umlal v9.2d,v29.2s,v2.s[3]
umlal v10.2d,v29.2s,v3.s[0]
ext v6.16b,v6.16b,v6.16b,#8
add v6.2d,v6.2d,v15.2d
umlal v11.2d,v29.2s,v3.s[1]
ushr v6.2d,v6.2d,#16
umlal v12.2d,v29.2s,v3.s[2]
umlal v13.2d,v29.2s,v3.s[3]
add v16.2d,v7.2d,v6.2d
ins v7.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0]
umlal v7.2d,v28.2s,v0.s[0]
ld1 {v6.2d},[x6],#16
umlal v8.2d,v28.2s,v0.s[1]
umlal v9.2d,v28.2s,v0.s[2]
shl v29.2d,v7.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v10.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v7.2d
umlal v11.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v12.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1]
umlal v13.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v6.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v7.2d,v29.2s,v2.s[0]
umlal v8.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v9.2d,v29.2s,v2.s[2]
ushr v15.2d,v7.2d,#16
umlal v10.2d,v29.2s,v2.s[3]
umlal v11.2d,v29.2s,v3.s[0]
ext v7.16b,v7.16b,v7.16b,#8
add v7.2d,v7.2d,v15.2d
umlal v12.2d,v29.2s,v3.s[1]
ushr v7.2d,v7.2d,#16
umlal v13.2d,v29.2s,v3.s[2]
umlal v6.2d,v29.2s,v3.s[3]
add v16.2d,v8.2d,v7.2d
ins v8.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1]
umlal v8.2d,v28.2s,v0.s[0]
ld1 {v7.2d},[x6],#16
umlal v9.2d,v28.2s,v0.s[1]
umlal v10.2d,v28.2s,v0.s[2]
shl v29.2d,v8.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v11.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v8.2d
umlal v12.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v13.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2]
umlal v6.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v7.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v8.2d,v29.2s,v2.s[0]
umlal v9.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v10.2d,v29.2s,v2.s[2]
ushr v15.2d,v8.2d,#16
umlal v11.2d,v29.2s,v2.s[3]
umlal v12.2d,v29.2s,v3.s[0]
ext v8.16b,v8.16b,v8.16b,#8
add v8.2d,v8.2d,v15.2d
umlal v13.2d,v29.2s,v3.s[1]
ushr v8.2d,v8.2d,#16
umlal v6.2d,v29.2s,v3.s[2]
umlal v7.2d,v29.2s,v3.s[3]
add v16.2d,v9.2d,v8.2d
ins v9.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2]
umlal v9.2d,v28.2s,v0.s[0]
ld1 {v8.2d},[x6],#16
umlal v10.2d,v28.2s,v0.s[1]
umlal v11.2d,v28.2s,v0.s[2]
shl v29.2d,v9.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v12.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v9.2d
umlal v13.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v6.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3]
umlal v7.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v8.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v9.2d,v29.2s,v2.s[0]
umlal v10.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v11.2d,v29.2s,v2.s[2]
ushr v15.2d,v9.2d,#16
umlal v12.2d,v29.2s,v2.s[3]
umlal v13.2d,v29.2s,v3.s[0]
ext v9.16b,v9.16b,v9.16b,#8
add v9.2d,v9.2d,v15.2d
umlal v6.2d,v29.2s,v3.s[1]
ushr v9.2d,v9.2d,#16
umlal v7.2d,v29.2s,v3.s[2]
umlal v8.2d,v29.2s,v3.s[3]
add v16.2d,v10.2d,v9.2d
ins v10.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3]
umlal v10.2d,v28.2s,v0.s[0]
ld1 {v9.2d},[x6],#16
umlal v11.2d,v28.2s,v0.s[1]
umlal v12.2d,v28.2s,v0.s[2]
shl v29.2d,v10.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v13.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v10.2d
umlal v6.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v7.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4]
umlal v8.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v9.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v10.2d,v29.2s,v2.s[0]
umlal v11.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v12.2d,v29.2s,v2.s[2]
ushr v15.2d,v10.2d,#16
umlal v13.2d,v29.2s,v2.s[3]
umlal v6.2d,v29.2s,v3.s[0]
ext v10.16b,v10.16b,v10.16b,#8
add v10.2d,v10.2d,v15.2d
umlal v7.2d,v29.2s,v3.s[1]
ushr v10.2d,v10.2d,#16
umlal v8.2d,v29.2s,v3.s[2]
umlal v9.2d,v29.2s,v3.s[3]
add v16.2d,v11.2d,v10.2d
ins v11.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4]
umlal v11.2d,v28.2s,v0.s[0]
ld1 {v10.2d},[x6],#16
umlal v12.2d,v28.2s,v0.s[1]
umlal v13.2d,v28.2s,v0.s[2]
shl v29.2d,v11.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v6.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v11.2d
umlal v7.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v8.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5]
umlal v9.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v10.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v11.2d,v29.2s,v2.s[0]
umlal v12.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v13.2d,v29.2s,v2.s[2]
ushr v15.2d,v11.2d,#16
umlal v6.2d,v29.2s,v2.s[3]
umlal v7.2d,v29.2s,v3.s[0]
ext v11.16b,v11.16b,v11.16b,#8
add v11.2d,v11.2d,v15.2d
umlal v8.2d,v29.2s,v3.s[1]
ushr v11.2d,v11.2d,#16
umlal v9.2d,v29.2s,v3.s[2]
umlal v10.2d,v29.2s,v3.s[3]
add v16.2d,v12.2d,v11.2d
ins v12.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5]
umlal v12.2d,v28.2s,v0.s[0]
ld1 {v11.2d},[x6],#16
umlal v13.2d,v28.2s,v0.s[1]
umlal v6.2d,v28.2s,v0.s[2]
shl v29.2d,v12.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v7.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v12.2d
umlal v8.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v9.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6]
umlal v10.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v11.2d,v28.2s,v1.s[3]
ldr s28,[x2],#4 // *b++
umlal v12.2d,v29.2s,v2.s[0]
umlal v13.2d,v29.2s,v2.s[1]
uxtl v28.4s,v28.4h
umlal v6.2d,v29.2s,v2.s[2]
ushr v15.2d,v12.2d,#16
umlal v7.2d,v29.2s,v2.s[3]
umlal v8.2d,v29.2s,v3.s[0]
ext v12.16b,v12.16b,v12.16b,#8
add v12.2d,v12.2d,v15.2d
umlal v9.2d,v29.2s,v3.s[1]
ushr v12.2d,v12.2d,#16
umlal v10.2d,v29.2s,v3.s[2]
umlal v11.2d,v29.2s,v3.s[3]
add v16.2d,v13.2d,v12.2d
ins v13.d[0],v16.d[0]
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6]
umlal v13.2d,v28.2s,v0.s[0]
ld1 {v12.2d},[x6],#16
umlal v6.2d,v28.2s,v0.s[1]
umlal v7.2d,v28.2s,v0.s[2]
shl v29.2d,v13.2d,#16
ext v29.16b,v29.16b,v29.16b,#8
umlal v8.2d,v28.2s,v0.s[3]
add v29.2d,v29.2d,v13.2d
umlal v9.2d,v28.2s,v1.s[0]
mul v29.2s,v29.2s,v30.2s
umlal v10.2d,v28.2s,v1.s[1]
st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7]
umlal v11.2d,v28.2s,v1.s[2]
uxtl v29.4s,v29.4h
umlal v12.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
umlal v13.2d,v29.2s,v2.s[0]
ld1 {v0.4s,v1.4s},[x1],#32
umlal v6.2d,v29.2s,v2.s[1]
umlal v7.2d,v29.2s,v2.s[2]
mov v5.16b,v13.16b
ushr v5.2d,v5.2d,#16
ext v13.16b,v13.16b,v13.16b,#8
umlal v8.2d,v29.2s,v2.s[3]
umlal v9.2d,v29.2s,v3.s[0]
add v13.2d,v13.2d,v5.2d
umlal v10.2d,v29.2s,v3.s[1]
ushr v13.2d,v13.2d,#16
eor v15.16b,v15.16b,v15.16b
ins v13.d[1],v15.d[0]
umlal v11.2d,v29.2s,v3.s[2]
umlal v12.2d,v29.2s,v3.s[3]
add v6.2d,v6.2d,v13.2d
st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7]
add x10,sp,#8 // rewind
sub x8,x5,#8
b .LNEON_8n_inner

.align 4
.LNEON_8n_inner:
subs x8,x8,#8
umlal v6.2d,v28.2s,v0.s[0]
ld1 {v13.2d},[x6]
umlal v7.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0]
umlal v8.2d,v28.2s,v0.s[2]
ld1 {v2.4s,v3.4s},[x3],#32
umlal v9.2d,v28.2s,v0.s[3]
b.eq .LInner_jump
add x6,x6,#16 // don't advance in last iteration
.LInner_jump:
umlal v10.2d,v28.2s,v1.s[0]
umlal v11.2d,v28.2s,v1.s[1]
umlal v12.2d,v28.2s,v1.s[2]
umlal v13.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1]
umlal v6.2d,v29.2s,v2.s[0]
umlal v7.2d,v29.2s,v2.s[1]
umlal v8.2d,v29.2s,v2.s[2]
umlal v9.2d,v29.2s,v2.s[3]
umlal v10.2d,v29.2s,v3.s[0]
umlal v11.2d,v29.2s,v3.s[1]
umlal v12.2d,v29.2s,v3.s[2]
umlal v13.2d,v29.2s,v3.s[3]
st1 {v6.2d},[x7],#16
umlal v7.2d,v28.2s,v0.s[0]
ld1 {v6.2d},[x6]
umlal v8.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1]
umlal v9.2d,v28.2s,v0.s[2]
b.eq .LInner_jump1
add x6,x6,#16 // don't advance in last iteration
.LInner_jump1:
umlal v10.2d,v28.2s,v0.s[3]
umlal v11.2d,v28.2s,v1.s[0]
umlal v12.2d,v28.2s,v1.s[1]
umlal v13.2d,v28.2s,v1.s[2]
umlal v6.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2]
umlal v7.2d,v29.2s,v2.s[0]
umlal v8.2d,v29.2s,v2.s[1]
umlal v9.2d,v29.2s,v2.s[2]
umlal v10.2d,v29.2s,v2.s[3]
umlal v11.2d,v29.2s,v3.s[0]
umlal v12.2d,v29.2s,v3.s[1]
umlal v13.2d,v29.2s,v3.s[2]
umlal v6.2d,v29.2s,v3.s[3]
st1 {v7.2d},[x7],#16
umlal v8.2d,v28.2s,v0.s[0]
ld1 {v7.2d},[x6]
umlal v9.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2]
umlal v10.2d,v28.2s,v0.s[2]
b.eq .LInner_jump2
add x6,x6,#16 // don't advance in last iteration
.LInner_jump2:
umlal v11.2d,v28.2s,v0.s[3]
umlal v12.2d,v28.2s,v1.s[0]
umlal v13.2d,v28.2s,v1.s[1]
umlal v6.2d,v28.2s,v1.s[2]
umlal v7.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3]
umlal v8.2d,v29.2s,v2.s[0]
umlal v9.2d,v29.2s,v2.s[1]
umlal v10.2d,v29.2s,v2.s[2]
umlal v11.2d,v29.2s,v2.s[3]
umlal v12.2d,v29.2s,v3.s[0]
umlal v13.2d,v29.2s,v3.s[1]
umlal v6.2d,v29.2s,v3.s[2]
umlal v7.2d,v29.2s,v3.s[3]
st1 {v8.2d},[x7],#16
umlal v9.2d,v28.2s,v0.s[0]
ld1 {v8.2d},[x6]
umlal v10.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3]
umlal v11.2d,v28.2s,v0.s[2]
b.eq .LInner_jump3
add x6,x6,#16 // don't advance in last iteration
.LInner_jump3:
umlal v12.2d,v28.2s,v0.s[3]
umlal v13.2d,v28.2s,v1.s[0]
umlal v6.2d,v28.2s,v1.s[1]
umlal v7.2d,v28.2s,v1.s[2]
umlal v8.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4]
umlal v9.2d,v29.2s,v2.s[0]
umlal v10.2d,v29.2s,v2.s[1]
umlal v11.2d,v29.2s,v2.s[2]
umlal v12.2d,v29.2s,v2.s[3]
umlal v13.2d,v29.2s,v3.s[0]
umlal v6.2d,v29.2s,v3.s[1]
umlal v7.2d,v29.2s,v3.s[2]
umlal v8.2d,v29.2s,v3.s[3]
st1 {v9.2d},[x7],#16
umlal v10.2d,v28.2s,v0.s[0]
ld1 {v9.2d},[x6]
umlal v11.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4]
umlal v12.2d,v28.2s,v0.s[2]
b.eq .LInner_jump4
add x6,x6,#16 // don't advance in last iteration
.LInner_jump4:
umlal v13.2d,v28.2s,v0.s[3]
umlal v6.2d,v28.2s,v1.s[0]
umlal v7.2d,v28.2s,v1.s[1]
umlal v8.2d,v28.2s,v1.s[2]
umlal v9.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5]
umlal v10.2d,v29.2s,v2.s[0]
umlal v11.2d,v29.2s,v2.s[1]
umlal v12.2d,v29.2s,v2.s[2]
umlal v13.2d,v29.2s,v2.s[3]
umlal v6.2d,v29.2s,v3.s[0]
umlal v7.2d,v29.2s,v3.s[1]
umlal v8.2d,v29.2s,v3.s[2]
umlal v9.2d,v29.2s,v3.s[3]
st1 {v10.2d},[x7],#16
umlal v11.2d,v28.2s,v0.s[0]
ld1 {v10.2d},[x6]
umlal v12.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5]
umlal v13.2d,v28.2s,v0.s[2]
b.eq .LInner_jump5
add x6,x6,#16 // don't advance in last iteration
.LInner_jump5:
umlal v6.2d,v28.2s,v0.s[3]
umlal v7.2d,v28.2s,v1.s[0]
umlal v8.2d,v28.2s,v1.s[1]
umlal v9.2d,v28.2s,v1.s[2]
umlal v10.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6]
umlal v11.2d,v29.2s,v2.s[0]
umlal v12.2d,v29.2s,v2.s[1]
umlal v13.2d,v29.2s,v2.s[2]
umlal v6.2d,v29.2s,v2.s[3]
umlal v7.2d,v29.2s,v3.s[0]
umlal v8.2d,v29.2s,v3.s[1]
umlal v9.2d,v29.2s,v3.s[2]
umlal v10.2d,v29.2s,v3.s[3]
st1 {v11.2d},[x7],#16
umlal v12.2d,v28.2s,v0.s[0]
ld1 {v11.2d},[x6]
umlal v13.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6]
umlal v6.2d,v28.2s,v0.s[2]
b.eq .LInner_jump6
add x6,x6,#16 // don't advance in last iteration
.LInner_jump6:
umlal v7.2d,v28.2s,v0.s[3]
umlal v8.2d,v28.2s,v1.s[0]
umlal v9.2d,v28.2s,v1.s[1]
umlal v10.2d,v28.2s,v1.s[2]
umlal v11.2d,v28.2s,v1.s[3]
ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7]
umlal v12.2d,v29.2s,v2.s[0]
umlal v13.2d,v29.2s,v2.s[1]
umlal v6.2d,v29.2s,v2.s[2]
umlal v7.2d,v29.2s,v2.s[3]
umlal v8.2d,v29.2s,v3.s[0]
umlal v9.2d,v29.2s,v3.s[1]
umlal v10.2d,v29.2s,v3.s[2]
umlal v11.2d,v29.2s,v3.s[3]
st1 {v12.2d},[x7],#16
umlal v13.2d,v28.2s,v0.s[0]
ld1 {v12.2d},[x6]
umlal v6.2d,v28.2s,v0.s[1]
ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7]
umlal v7.2d,v28.2s,v0.s[2]
b.eq .LInner_jump7
add x6,x6,#16 // don't advance in last iteration
.LInner_jump7:
umlal v8.2d,v28.2s,v0.s[3]
umlal v9.2d,v28.2s,v1.s[0]
umlal v10.2d,v28.2s,v1.s[1]
umlal v11.2d,v28.2s,v1.s[2]
umlal v12.2d,v28.2s,v1.s[3]
b.ne .LInner_after_rewind8
sub x1,x1,x5,lsl#2 // rewind
.LInner_after_rewind8:
umlal v13.2d,v29.2s,v2.s[0]
ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
umlal v6.2d,v29.2s,v2.s[1]
ld1 {v0.4s,v1.4s},[x1],#32
umlal v7.2d,v29.2s,v2.s[2]
add x10,sp,#8 // rewind
umlal v8.2d,v29.2s,v2.s[3]
umlal v9.2d,v29.2s,v3.s[0]
umlal v10.2d,v29.2s,v3.s[1]
umlal v11.2d,v29.2s,v3.s[2]
st1 {v13.2d},[x7],#16
umlal v12.2d,v29.2s,v3.s[3]

bne .LNEON_8n_inner
add x6,sp,#128
st1 {v6.2d,v7.2d},[x7],#32
eor v2.16b,v2.16b,v2.16b // v2
st1 {v8.2d,v9.2d},[x7],#32
eor v3.16b,v3.16b,v3.16b // v3
st1 {v10.2d,v11.2d},[x7],#32
st1 {v12.2d},[x7]

subs x9,x9,#8
ld1 {v6.2d,v7.2d},[x6],#32
ld1 {v8.2d,v9.2d},[x6],#32
ld1 {v10.2d,v11.2d},[x6],#32
ld1 {v12.2d,v13.2d},[x6],#32

b.eq .LInner_8n_jump_2steps
sub x3,x3,x5,lsl#2 // rewind
b .LNEON_8n_outer

.LInner_8n_jump_2steps:
add x7,sp,#128
st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame
mov v5.16b,v6.16b
ushr v15.2d,v6.2d,#16
ext v6.16b,v6.16b,v6.16b,#8
st1 {v2.2d,v3.2d}, [sp],#32
add v6.2d,v6.2d,v15.2d
st1 {v2.2d,v3.2d}, [sp],#32
ushr v15.2d,v6.2d,#16
st1 {v2.2d,v3.2d}, [sp],#32
zip1 v6.4h,v5.4h,v6.4h
ins v15.d[1],v14.d[0]

mov x8,x5
b .LNEON_tail_entry

.align 4
.LNEON_tail:
add v6.2d,v6.2d,v15.2d
mov v5.16b,v6.16b
ushr v15.2d,v6.2d,#16
ext v6.16b,v6.16b,v6.16b,#8
ld1 {v8.2d,v9.2d}, [x6],#32
add v6.2d,v6.2d,v15.2d
ld1 {v10.2d,v11.2d}, [x6],#32
ushr v15.2d,v6.2d,#16
ld1 {v12.2d,v13.2d}, [x6],#32
zip1 v6.4h,v5.4h,v6.4h
ins v15.d[1],v14.d[0]

.LNEON_tail_entry:
add v7.2d,v7.2d,v15.2d
st1 {v6.s}[0], [x7],#4
ushr v15.2d,v7.2d,#16
mov v5.16b,v7.16b
ext v7.16b,v7.16b,v7.16b,#8
add v7.2d,v7.2d,v15.2d
ushr v15.2d,v7.2d,#16
zip1 v7.4h,v5.4h,v7.4h
ins v15.d[1],v14.d[0]
add v8.2d,v8.2d,v15.2d
st1 {v7.s}[0], [x7],#4
ushr v15.2d,v8.2d,#16
mov v5.16b,v8.16b
ext v8.16b,v8.16b,v8.16b,#8
add v8.2d,v8.2d,v15.2d
ushr v15.2d,v8.2d,#16
zip1 v8.4h,v5.4h,v8.4h
ins v15.d[1],v14.d[0]
add v9.2d,v9.2d,v15.2d
st1 {v8.s}[0], [x7],#4
ushr v15.2d,v9.2d,#16
mov v5.16b,v9.16b
ext v9.16b,v9.16b,v9.16b,#8
add v9.2d,v9.2d,v15.2d
ushr v15.2d,v9.2d,#16
zip1 v9.4h,v5.4h,v9.4h
ins v15.d[1],v14.d[0]
add v10.2d,v10.2d,v15.2d
st1 {v9.s}[0], [x7],#4
ushr v15.2d,v10.2d,#16
mov v5.16b,v10.16b
ext v10.16b,v10.16b,v10.16b,#8
add v10.2d,v10.2d,v15.2d
ushr v15.2d,v10.2d,#16
zip1 v10.4h,v5.4h,v10.4h
ins v15.d[1],v14.d[0]
add v11.2d,v11.2d,v15.2d
st1 {v10.s}[0], [x7],#4
ushr v15.2d,v11.2d,#16
mov v5.16b,v11.16b
ext v11.16b,v11.16b,v11.16b,#8
add v11.2d,v11.2d,v15.2d
ushr v15.2d,v11.2d,#16
zip1 v11.4h,v5.4h,v11.4h
ins v15.d[1],v14.d[0]
add v12.2d,v12.2d,v15.2d
st1 {v11.s}[0], [x7],#4
ushr v15.2d,v12.2d,#16
mov v5.16b,v12.16b
ext v12.16b,v12.16b,v12.16b,#8
add v12.2d,v12.2d,v15.2d
ushr v15.2d,v12.2d,#16
zip1 v12.4h,v5.4h,v12.4h
ins v15.d[1],v14.d[0]
add v13.2d,v13.2d,v15.2d
st1 {v12.s}[0], [x7],#4
ushr v15.2d,v13.2d,#16
mov v5.16b,v13.16b
ext v13.16b,v13.16b,v13.16b,#8
add v13.2d,v13.2d,v15.2d
ushr v15.2d,v13.2d,#16
zip1 v13.4h,v5.4h,v13.4h
ins v15.d[1],v14.d[0]
ld1 {v6.2d,v7.2d}, [x6],#32
subs x8,x8,#8
st1 {v13.s}[0], [x7],#4
bne .LNEON_tail

st1 {v15.s}[0], [x7],#4 // top-most bit
sub x3,x3,x5,lsl#2 // rewind x3
subs x1,sp,#0 // clear carry flag
add x2,sp,x5,lsl#2

.LNEON_sub:
ldp w4,w5,[x1],#8
ldp w6,w7,[x1],#8
ldp w8,w9,[x3],#8
ldp w10,w11,[x3],#8
sbcs w8,w4,w8
sbcs w9,w5,w9
sbcs w10,w6,w10
sbcs w11,w7,w11
sub x17,x2,x1
stp w8,w9,[x0],#8
stp w10,w11,[x0],#8
cbnz x17,.LNEON_sub

ldr w10, [x1] // load top-most bit
mov x11,sp
eor v0.16b,v0.16b,v0.16b
sub x11,x2,x11 // this is num*4
eor v1.16b,v1.16b,v1.16b
mov x1,sp
sub x0,x0,x11 // rewind x0
mov x3,x2 // second 3/4th of frame
sbcs w10,w10,wzr // result is carry flag

.LNEON_copy_n_zap:
ldp w4,w5,[x1],#8
ldp w6,w7,[x1],#8
ldp w8,w9,[x0],#8
ldp w10,w11,[x0]
sub x0,x0,#8
b.cs .LCopy_1
mov w8,w4
mov w9,w5
mov w10,w6
mov w11,w7
.LCopy_1:
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
ldp w4,w5,[x1],#8
ldp w6,w7,[x1],#8
stp w8,w9,[x0],#8
stp w10,w11,[x0],#8
sub x1,x1,#32
ldp w8,w9,[x0],#8
ldp w10,w11,[x0]
sub x0,x0,#8
b.cs .LCopy_2
mov w8, w4
mov w9, w5
mov w10, w6
mov w11, w7
.LCopy_2:
st1 {v0.2d,v1.2d}, [x1],#32 // wipe
st1 {v0.2d,v1.2d}, [x3],#32 // wipe
sub x17,x2,x1 // preserves carry
stp w8,w9,[x0],#8
stp w10,w11,[x0],#8
cbnz x17,.LNEON_copy_n_zap

mov sp,x16
ldp d14,d15,[sp,#64]
ldp d12,d13,[sp,#48]
ldp d10,d11,[sp,#32]
ldp d8,d9,[sp,#16]
ldr x29,[sp],#80
AARCH64_VALIDATE_LINK_REGISTER
ret // bx lr

.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
cmp x1,x2
b.ne __bn_mul4x_mont
.Lsqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x3,[sp,#96] // offload rp and np

ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
ldp x12,x13,[x1,#8*6]

sub x2,sp,x5,lsl#4
lsl x5,x5,#3
ldr x4,[x4] // *n0
mov sp,x2 // alloca
sub x27,x5,#8*8
b .Lsqr8x_zero_start

.Lsqr8x_zero:
sub x27,x27,#8*8
stp xzr,xzr,[x2,#8*0]
stp xzr,xzr,[x2,#8*2]
stp xzr,xzr,[x2,#8*4]
stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
stp xzr,xzr,[x2,#8*8]
stp xzr,xzr,[x2,#8*10]
stp xzr,xzr,[x2,#8*12]
stp xzr,xzr,[x2,#8*14]
add x2,x2,#8*16
cbnz x27,.Lsqr8x_zero

add x3,x1,x5
add x1,x1,#8*8
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
mov x23,xzr
mov x24,xzr
mov x25,xzr
mov x26,xzr
mov x2,sp
str x4,[x29,#112] // offload n0

// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
//
a[7]a[6] (vii)10161017mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)1018mul x15,x8,x61019mul x16,x9,x61020mul x17,x10,x61021adds x20,x20,x14 // t[1]+lo(a[1]*a[0])1022mul x14,x11,x61023adcs x21,x21,x151024mul x15,x12,x61025adcs x22,x22,x161026mul x16,x13,x61027adcs x23,x23,x171028umulh x17,x7,x6 // hi(a[1..7]*a[0])1029adcs x24,x24,x141030umulh x14,x8,x61031adcs x25,x25,x151032umulh x15,x9,x61033adcs x26,x26,x161034umulh x16,x10,x61035stp x19,x20,[x2],#8*2 // t[0..1]1036adc x19,xzr,xzr // t[8]1037adds x21,x21,x17 // t[2]+lo(a[1]*a[0])1038umulh x17,x11,x61039adcs x22,x22,x141040umulh x14,x12,x61041adcs x23,x23,x151042umulh x15,x13,x61043adcs x24,x24,x161044mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)1045adcs x25,x25,x171046mul x17,x9,x71047adcs x26,x26,x141048mul x14,x10,x71049adc x19,x19,x1510501051mul x15,x11,x71052adds x22,x22,x161053mul x16,x12,x71054adcs x23,x23,x171055mul x17,x13,x71056adcs x24,x24,x141057umulh x14,x8,x7 // hi(a[2..7]*a[1])1058adcs x25,x25,x151059umulh x15,x9,x71060adcs x26,x26,x161061umulh x16,x10,x71062adcs x19,x19,x171063umulh x17,x11,x71064stp x21,x22,[x2],#8*2 // t[2..3]1065adc x20,xzr,xzr // t[9]1066adds x23,x23,x141067umulh x14,x12,x71068adcs x24,x24,x151069umulh x15,x13,x71070adcs x25,x25,x161071mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)1072adcs x26,x26,x171073mul x17,x10,x81074adcs x19,x19,x141075mul x14,x11,x81076adc x20,x20,x1510771078mul x15,x12,x81079adds x24,x24,x161080mul x16,x13,x81081adcs x25,x25,x171082umulh x17,x9,x8 // hi(a[3..7]*a[2])1083adcs x26,x26,x141084umulh x14,x10,x81085adcs x19,x19,x151086umulh x15,x11,x81087adcs x20,x20,x161088umulh x16,x12,x81089stp x23,x24,[x2],#8*2 // t[4..5]1090adc x21,xzr,xzr // t[10]1091adds x25,x25,x171092umulh x17,x13,x81093adcs x26,x26,x141094mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)1095adcs x19,x19,x151096mul x15,x11,x91097adcs x20,x20,x161098mul x16,x12,x91099adc x21,x21,x1711001101mul x17,x13,x91102adds x26,x26,x141103umulh x14,x10,x9 // hi(a[4..7]*a[3])1104adcs x19,x19,x151105umulh x15,x11,x91106adcs 
x20,x20,x161107umulh x16,x12,x91108adcs x21,x21,x171109umulh x17,x13,x91110stp x25,x26,[x2],#8*2 // t[6..7]1111adc x22,xzr,xzr // t[11]1112adds x19,x19,x141113mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)1114adcs x20,x20,x151115mul x15,x12,x101116adcs x21,x21,x161117mul x16,x13,x101118adc x22,x22,x1711191120umulh x17,x11,x10 // hi(a[5..7]*a[4])1121adds x20,x20,x141122umulh x14,x12,x101123adcs x21,x21,x151124umulh x15,x13,x101125adcs x22,x22,x161126mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)1127adc x23,xzr,xzr // t[12]1128adds x21,x21,x171129mul x17,x13,x111130adcs x22,x22,x141131umulh x14,x12,x11 // hi(a[6..7]*a[5])1132adc x23,x23,x1511331134umulh x15,x13,x111135adds x22,x22,x161136mul x16,x13,x12 // lo(a[7]*a[6]) (vii)1137adcs x23,x23,x171138umulh x17,x13,x12 // hi(a[7]*a[6])1139adc x24,xzr,xzr // t[13]1140adds x23,x23,x141141sub x27,x3,x1 // done yet?1142adc x24,x24,x1511431144adds x24,x24,x161145sub x14,x3,x5 // rewinded ap1146adc x25,xzr,xzr // t[14]1147add x25,x25,x1711481149cbz x27,.Lsqr8x_outer_break11501151mov x4,x61152ldp x6,x7,[x2,#8*0]1153ldp x8,x9,[x2,#8*2]1154ldp x10,x11,[x2,#8*4]1155ldp x12,x13,[x2,#8*6]1156adds x19,x19,x61157adcs x20,x20,x71158ldp x6,x7,[x1,#8*0]1159adcs x21,x21,x81160adcs x22,x22,x91161ldp x8,x9,[x1,#8*2]1162adcs x23,x23,x101163adcs x24,x24,x111164ldp x10,x11,[x1,#8*4]1165adcs x25,x25,x121166mov x0,x11167adcs x26,xzr,x131168ldp x12,x13,[x1,#8*6]1169add x1,x1,#8*81170//adc x28,xzr,xzr // moved below1171mov x27,#-8*811721173// a[8]a[0]1174// a[9]a[0]1175// a[a]a[0]1176// a[b]a[0]1177// a[c]a[0]1178// a[d]a[0]1179// a[e]a[0]1180// a[f]a[0]1181// a[8]a[1]1182// a[f]a[1]........................1183// a[8]a[2]1184// a[f]a[2]........................1185// a[8]a[3]1186// a[f]a[3]........................1187// a[8]a[4]1188// a[f]a[4]........................1189// a[8]a[5]1190// a[f]a[5]........................1191// a[8]a[6]1192// a[f]a[6]........................1193// a[8]a[7]1194// a[f]a[7]........................1195.Lsqr8x_mul:1196mul 
x14,x6,x41197adc x28,xzr,xzr // carry bit, modulo-scheduled1198mul x15,x7,x41199add x27,x27,#81200mul x16,x8,x41201mul x17,x9,x41202adds x19,x19,x141203mul x14,x10,x41204adcs x20,x20,x151205mul x15,x11,x41206adcs x21,x21,x161207mul x16,x12,x41208adcs x22,x22,x171209mul x17,x13,x41210adcs x23,x23,x141211umulh x14,x6,x41212adcs x24,x24,x151213umulh x15,x7,x41214adcs x25,x25,x161215umulh x16,x8,x41216adcs x26,x26,x171217umulh x17,x9,x41218adc x28,x28,xzr1219str x19,[x2],#81220adds x19,x20,x141221umulh x14,x10,x41222adcs x20,x21,x151223umulh x15,x11,x41224adcs x21,x22,x161225umulh x16,x12,x41226adcs x22,x23,x171227umulh x17,x13,x41228ldr x4,[x0,x27]1229adcs x23,x24,x141230adcs x24,x25,x151231adcs x25,x26,x161232adcs x26,x28,x171233//adc x28,xzr,xzr // moved above1234cbnz x27,.Lsqr8x_mul1235// note that carry flag is guaranteed1236// to be zero at this point1237cmp x1,x3 // done yet?1238b.eq .Lsqr8x_break12391240ldp x6,x7,[x2,#8*0]1241ldp x8,x9,[x2,#8*2]1242ldp x10,x11,[x2,#8*4]1243ldp x12,x13,[x2,#8*6]1244adds x19,x19,x61245ldur x4,[x0,#-8*8]1246adcs x20,x20,x71247ldp x6,x7,[x1,#8*0]1248adcs x21,x21,x81249adcs x22,x22,x91250ldp x8,x9,[x1,#8*2]1251adcs x23,x23,x101252adcs x24,x24,x111253ldp x10,x11,[x1,#8*4]1254adcs x25,x25,x121255mov x27,#-8*81256adcs x26,x26,x131257ldp x12,x13,[x1,#8*6]1258add x1,x1,#8*81259//adc x28,xzr,xzr // moved above1260b .Lsqr8x_mul12611262.align 41263.Lsqr8x_break:1264ldp x6,x7,[x0,#8*0]1265add x1,x0,#8*81266ldp x8,x9,[x0,#8*2]1267sub x14,x3,x1 // is it last iteration?1268ldp x10,x11,[x0,#8*4]1269sub x15,x2,x141270ldp x12,x13,[x0,#8*6]1271cbz x14,.Lsqr8x_outer_loop12721273stp x19,x20,[x2,#8*0]1274ldp x19,x20,[x15,#8*0]1275stp x21,x22,[x2,#8*2]1276ldp x21,x22,[x15,#8*2]1277stp x23,x24,[x2,#8*4]1278ldp x23,x24,[x15,#8*4]1279stp x25,x26,[x2,#8*6]1280mov x2,x151281ldp x25,x26,[x15,#8*6]1282b .Lsqr8x_outer_loop12831284.align 41285.Lsqr8x_outer_break:1286// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]1287ldp x7,x9,[x14,#8*0] // 
recall that x14 is &a[0]1288ldp x15,x16,[sp,#8*1]1289ldp x11,x13,[x14,#8*2]1290add x1,x14,#8*41291ldp x17,x14,[sp,#8*3]12921293stp x19,x20,[x2,#8*0]1294mul x19,x7,x71295stp x21,x22,[x2,#8*2]1296umulh x7,x7,x71297stp x23,x24,[x2,#8*4]1298mul x8,x9,x91299stp x25,x26,[x2,#8*6]1300mov x2,sp1301umulh x9,x9,x91302adds x20,x7,x15,lsl#11303extr x15,x16,x15,#631304sub x27,x5,#8*413051306.Lsqr4x_shift_n_add:1307adcs x21,x8,x151308extr x16,x17,x16,#631309sub x27,x27,#8*41310adcs x22,x9,x161311ldp x15,x16,[x2,#8*5]1312mul x10,x11,x111313ldp x7,x9,[x1],#8*21314umulh x11,x11,x111315mul x12,x13,x131316umulh x13,x13,x131317extr x17,x14,x17,#631318stp x19,x20,[x2,#8*0]1319adcs x23,x10,x171320extr x14,x15,x14,#631321stp x21,x22,[x2,#8*2]1322adcs x24,x11,x141323ldp x17,x14,[x2,#8*7]1324extr x15,x16,x15,#631325adcs x25,x12,x151326extr x16,x17,x16,#631327adcs x26,x13,x161328ldp x15,x16,[x2,#8*9]1329mul x6,x7,x71330ldp x11,x13,[x1],#8*21331umulh x7,x7,x71332mul x8,x9,x91333umulh x9,x9,x91334stp x23,x24,[x2,#8*4]1335extr x17,x14,x17,#631336stp x25,x26,[x2,#8*6]1337add x2,x2,#8*81338adcs x19,x6,x171339extr x14,x15,x14,#631340adcs x20,x7,x141341ldp x17,x14,[x2,#8*3]1342extr x15,x16,x15,#631343cbnz x27,.Lsqr4x_shift_n_add1344ldp x1,x4,[x29,#104] // pull np and n013451346adcs x21,x8,x151347extr x16,x17,x16,#631348adcs x22,x9,x161349ldp x15,x16,[x2,#8*5]1350mul x10,x11,x111351umulh x11,x11,x111352stp x19,x20,[x2,#8*0]1353mul x12,x13,x131354umulh x13,x13,x131355stp x21,x22,[x2,#8*2]1356extr x17,x14,x17,#631357adcs x23,x10,x171358extr x14,x15,x14,#631359ldp x19,x20,[sp,#8*0]1360adcs x24,x11,x141361extr x15,x16,x15,#631362ldp x6,x7,[x1,#8*0]1363adcs x25,x12,x151364extr x16,xzr,x16,#631365ldp x8,x9,[x1,#8*2]1366adc x26,x13,x161367ldp x10,x11,[x1,#8*4]13681369// Reduce by 512 bits per iteration1370mul x28,x4,x19 // t[0]*n01371ldp x12,x13,[x1,#8*6]1372add x3,x1,x51373ldp x21,x22,[sp,#8*2]1374stp x23,x24,[x2,#8*4]1375ldp x23,x24,[sp,#8*4]1376stp x25,x26,[x2,#8*6]1377ldp x25,x26,[sp,#8*6]1378add 
x1,x1,#8*81379mov x30,xzr // initial top-most carry1380mov x2,sp1381mov x27,#813821383.Lsqr8x_reduction:1384// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)1385mul x15,x7,x281386sub x27,x27,#11387mul x16,x8,x281388str x28,[x2],#8 // put aside t[0]*n0 for tail processing1389mul x17,x9,x281390// (*) adds xzr,x19,x141391subs xzr,x19,#1 // (*)1392mul x14,x10,x281393adcs x19,x20,x151394mul x15,x11,x281395adcs x20,x21,x161396mul x16,x12,x281397adcs x21,x22,x171398mul x17,x13,x281399adcs x22,x23,x141400umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)1401adcs x23,x24,x151402umulh x15,x7,x281403adcs x24,x25,x161404umulh x16,x8,x281405adcs x25,x26,x171406umulh x17,x9,x281407adc x26,xzr,xzr1408adds x19,x19,x141409umulh x14,x10,x281410adcs x20,x20,x151411umulh x15,x11,x281412adcs x21,x21,x161413umulh x16,x12,x281414adcs x22,x22,x171415umulh x17,x13,x281416mul x28,x4,x19 // next t[0]*n01417adcs x23,x23,x141418adcs x24,x24,x151419adcs x25,x25,x161420adc x26,x26,x171421cbnz x27,.Lsqr8x_reduction14221423ldp x14,x15,[x2,#8*0]1424ldp x16,x17,[x2,#8*2]1425mov x0,x21426sub x27,x3,x1 // done yet?1427adds x19,x19,x141428adcs x20,x20,x151429ldp x14,x15,[x2,#8*4]1430adcs x21,x21,x161431adcs x22,x22,x171432ldp x16,x17,[x2,#8*6]1433adcs x23,x23,x141434adcs x24,x24,x151435adcs x25,x25,x161436adcs x26,x26,x171437//adc x28,xzr,xzr // moved below1438cbz x27,.Lsqr8x8_post_condition14391440ldur x4,[x2,#-8*8]1441ldp x6,x7,[x1,#8*0]1442ldp x8,x9,[x1,#8*2]1443ldp x10,x11,[x1,#8*4]1444mov x27,#-8*81445ldp x12,x13,[x1,#8*6]1446add x1,x1,#8*814471448.Lsqr8x_tail:1449mul x14,x6,x41450adc x28,xzr,xzr // carry bit, modulo-scheduled1451mul x15,x7,x41452add x27,x27,#81453mul x16,x8,x41454mul x17,x9,x41455adds x19,x19,x141456mul x14,x10,x41457adcs x20,x20,x151458mul x15,x11,x41459adcs x21,x21,x161460mul x16,x12,x41461adcs x22,x22,x171462mul x17,x13,x41463adcs x23,x23,x141464umulh x14,x6,x41465adcs x24,x24,x151466umulh x15,x7,x41467adcs x25,x25,x161468umulh x16,x8,x41469adcs x26,x26,x171470umulh x17,x9,x41471adc 
x28,x28,xzr1472str x19,[x2],#81473adds x19,x20,x141474umulh x14,x10,x41475adcs x20,x21,x151476umulh x15,x11,x41477adcs x21,x22,x161478umulh x16,x12,x41479adcs x22,x23,x171480umulh x17,x13,x41481ldr x4,[x0,x27]1482adcs x23,x24,x141483adcs x24,x25,x151484adcs x25,x26,x161485adcs x26,x28,x171486//adc x28,xzr,xzr // moved above1487cbnz x27,.Lsqr8x_tail1488// note that carry flag is guaranteed1489// to be zero at this point1490ldp x6,x7,[x2,#8*0]1491sub x27,x3,x1 // done yet?1492sub x16,x3,x5 // rewinded np1493ldp x8,x9,[x2,#8*2]1494ldp x10,x11,[x2,#8*4]1495ldp x12,x13,[x2,#8*6]1496cbz x27,.Lsqr8x_tail_break14971498ldur x4,[x0,#-8*8]1499adds x19,x19,x61500adcs x20,x20,x71501ldp x6,x7,[x1,#8*0]1502adcs x21,x21,x81503adcs x22,x22,x91504ldp x8,x9,[x1,#8*2]1505adcs x23,x23,x101506adcs x24,x24,x111507ldp x10,x11,[x1,#8*4]1508adcs x25,x25,x121509mov x27,#-8*81510adcs x26,x26,x131511ldp x12,x13,[x1,#8*6]1512add x1,x1,#8*81513//adc x28,xzr,xzr // moved above1514b .Lsqr8x_tail15151516.align 41517.Lsqr8x_tail_break:1518ldr x4,[x29,#112] // pull n01519add x27,x2,#8*8 // end of current t[num] window15201521subs xzr,x30,#1 // "move" top-most carry to carry bit1522adcs x14,x19,x61523adcs x15,x20,x71524ldp x19,x20,[x0,#8*0]1525adcs x21,x21,x81526ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]1527adcs x22,x22,x91528ldp x8,x9,[x16,#8*2]1529adcs x23,x23,x101530adcs x24,x24,x111531ldp x10,x11,[x16,#8*4]1532adcs x25,x25,x121533adcs x26,x26,x131534ldp x12,x13,[x16,#8*6]1535add x1,x16,#8*81536adc x30,xzr,xzr // top-most carry1537mul x28,x4,x191538stp x14,x15,[x2,#8*0]1539stp x21,x22,[x2,#8*2]1540ldp x21,x22,[x0,#8*2]1541stp x23,x24,[x2,#8*4]1542ldp x23,x24,[x0,#8*4]1543cmp x27,x29 // did we hit the bottom?1544stp x25,x26,[x2,#8*6]1545mov x2,x0 // slide the window1546ldp x25,x26,[x0,#8*6]1547mov x27,#81548b.ne .Lsqr8x_reduction15491550// Final step. We see if result is larger than modulus, and1551// if it is, subtract the modulus. But comparison implies1552// subtraction. 
So we subtract modulus, see if it borrowed,1553// and conditionally copy original value.1554ldr x0,[x29,#96] // pull rp1555add x2,x2,#8*81556subs x14,x19,x61557sbcs x15,x20,x71558sub x27,x5,#8*81559mov x3,x0 // x0 copy15601561.Lsqr8x_sub:1562sbcs x16,x21,x81563ldp x6,x7,[x1,#8*0]1564sbcs x17,x22,x91565stp x14,x15,[x0,#8*0]1566sbcs x14,x23,x101567ldp x8,x9,[x1,#8*2]1568sbcs x15,x24,x111569stp x16,x17,[x0,#8*2]1570sbcs x16,x25,x121571ldp x10,x11,[x1,#8*4]1572sbcs x17,x26,x131573ldp x12,x13,[x1,#8*6]1574add x1,x1,#8*81575ldp x19,x20,[x2,#8*0]1576sub x27,x27,#8*81577ldp x21,x22,[x2,#8*2]1578ldp x23,x24,[x2,#8*4]1579ldp x25,x26,[x2,#8*6]1580add x2,x2,#8*81581stp x14,x15,[x0,#8*4]1582sbcs x14,x19,x61583stp x16,x17,[x0,#8*6]1584add x0,x0,#8*81585sbcs x15,x20,x71586cbnz x27,.Lsqr8x_sub15871588sbcs x16,x21,x81589mov x2,sp1590add x1,sp,x51591ldp x6,x7,[x3,#8*0]1592sbcs x17,x22,x91593stp x14,x15,[x0,#8*0]1594sbcs x14,x23,x101595ldp x8,x9,[x3,#8*2]1596sbcs x15,x24,x111597stp x16,x17,[x0,#8*2]1598sbcs x16,x25,x121599ldp x19,x20,[x1,#8*0]1600sbcs x17,x26,x131601ldp x21,x22,[x1,#8*2]1602sbcs xzr,x30,xzr // did it borrow?1603ldr x30,[x29,#8] // pull return address1604stp x14,x15,[x0,#8*4]1605stp x16,x17,[x0,#8*6]16061607sub x27,x5,#8*41608.Lsqr4x_cond_copy:1609sub x27,x27,#8*41610csel x14,x19,x6,lo1611stp xzr,xzr,[x2,#8*0]1612csel x15,x20,x7,lo1613ldp x6,x7,[x3,#8*4]1614ldp x19,x20,[x1,#8*4]1615csel x16,x21,x8,lo1616stp xzr,xzr,[x2,#8*2]1617add x2,x2,#8*41618csel x17,x22,x9,lo1619ldp x8,x9,[x3,#8*6]1620ldp x21,x22,[x1,#8*6]1621add x1,x1,#8*41622stp x14,x15,[x3,#8*0]1623stp x16,x17,[x3,#8*2]1624add x3,x3,#8*41625stp xzr,xzr,[x1,#8*0]1626stp xzr,xzr,[x1,#8*2]1627cbnz x27,.Lsqr4x_cond_copy16281629csel x14,x19,x6,lo1630stp xzr,xzr,[x2,#8*0]1631csel x15,x20,x7,lo1632stp xzr,xzr,[x2,#8*2]1633csel x16,x21,x8,lo1634csel x17,x22,x9,lo1635stp x14,x15,[x3,#8*0]1636stp x16,x17,[x3,#8*2]16371638b .Lsqr8x_done16391640.align 41641.Lsqr8x8_post_condition:1642adc x28,xzr,xzr1643ldr x30,[x29,#8] // 
pull return address
	// Tail of __bn_sqr8x_mont (function entry is earlier in the file):
	// num==8 post-condition.  Subtract the modulus from the result and
	// keep the difference only if the subtraction did not borrow; the
	// temporary t[] on the stack is wiped with zeros as we go.
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	// "lo" (borrow occurred) selects the pre-subtraction value; csel
	// rather than a branch keeps the code path data-independent.
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	// Common epilogue: restore callee-saved registers, pop the
	// 128-byte frame and return 1 (success) in x0.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

//----------------------------------------------------------------------
// __bn_mul4x_mont(BN_ULONG rp[], const BN_ULONG ap[], const BN_ULONG bp[],
//                 const BN_ULONG np[], const BN_ULONG *n0, int num)
//
// Montgomery multiplication processing four 64-bit words per iteration.
// Reached only from bn_mul_mont (see the .Lscalar_impl dispatch at the
// top of the file), so num is a multiple of 4 here.
// In:	x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num (words)
// Out:	x0=1; the reduced product is written to rp[]
// Stack: num-word t[] vector plus a 4-word scratch area below the
// 128-byte register-save frame; x19-x28 saved/restored by this routine.
//----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// room for t[num]
	lsl	x5,x5,#3		// num is in bytes from here on
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// x19-x22: 4-word accumulator, starts at 0
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// byte index into current b[] window
	mov	x26,sp

	// First pass over b[0..3]: accumulate a[0..3]*b[i] and reduce on
	// the fly with m = t[0]*n0; x28 wraps modulo 32 (4 words) so the
	// same loop serves all four b words.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: done already

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

	// Propagate the first block across the remaining a[]/n[] words,
	// reusing the saved t[0]*n0 values from the stack.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	// Advance to the next 4-word window of b[] and restart the
	// multiply-and-reduce pass from a[0]/n[0].
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

	// Same structure as .Loop_mul4x_1st_reduction but now the t[]
	// vector from the previous pass is folded in as well.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	// End of one full pass over a[]; fold in the carried top word and
	// either loop for the next b[] window or fall through to the
	// final conditional subtraction.
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	// result - modulus, four words per iteration; interim difference
	// is streamed out to rp[].
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	// Branch-free select between result and result-modulus based on
	// the borrow flag, while wiping the t[] scratch with zeros.
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// num==4 special case: the whole result fits in x19-x22 plus the
	// carry in x0, so subtract/select entirely in registers.
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	// Common epilogue: restore callee-saved registers, pop the
	// 128-byte frame and return 1 (success) in x0.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.section	.rodata
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4

