Path: blob/main/sys/crypto/openssl/arm/chacha-armv4.S
39482 views
/* Do not modify. This file is auto-generated from chacha-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.text

@ Constant pool. ChaCha20_ctr32 reaches these words through fixed
@ negative offsets from its own entry label (-64 for .Lsigma, -32 for
@ .LOPENSSL_armcap), so nothing that emits bytes may be inserted
@ between .Lsigma and .LChaCha20_ctr32.
.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
# ifdef _WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word	-1
#endif

@ ----------------------------------------------------------------------
@ ChaCha20_ctr32
@   In:  r0      = out
@        r1      = inp
@        r2      = len (bytes); the function returns at once when 0
@        r3      = key (eight words)
@        [sp,#0] = pointer to four words of counter and nonce
@   Saves r4-r11 and lr on entry and returns by popping them into pc.
@   Frame after set-up (word offsets from sp):
@        0..15   sigma | key | counter | nonce
@        16..31  off-load area for state words that do not fit in registers
@        32..34  the out, inp and len values pushed on entry
@   When built with __ARM_MAX_ARCH__>=7, lengths above 192 branch to
@   .LChaCha20_neon if the ARMV7_NEON bit is set in OPENSSL_armcap_P.
@ ----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ stack argument: counter and nonce pointer
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ r14 = ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ nothing to process?
#ifdef __thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3		@ drop the pushed r0-r2
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ short inputs stay on the integer path
	bls	.Lshort
	ldr	r4,[r14,#-32]		@ word at .LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r4,[r14,r4]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ counter and nonce
	sub	sp,sp,#4*(16)		@ reserve off-load area
	sub	r14,r14,#64		@ r14 = .Lsigma
	stmdb	sp!,{r4-r7}		@ frame copy of counter and nonce
	ldmia	r3,{r4-r11}		@ key
	ldmia	r14,{r0-r3}		@ sigma
	stmdb	sp!,{r4-r11}		@ frame copy of key
	stmdb	sp!,{r0-r3}		@ frame copy of sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ reload key material
	str	r11,[sp,#4*(32+2)]	@ save len
	str	r12,[sp,#4*(32+1)]	@ save inp
	str	r14,[sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	r11,[sp,#4*(15)]
	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
	ldr	r10,[sp,#4*(13)]
	ldr	r14,[sp,#4*(14)]
	str	r11,[sp,#4*(16+15)]
	mov	r11,#10			@ iteration count for .Loop
	b	.Loop

@ Round loop: each of the ten iterations performs a column pass followed
@ by a diagonal pass. Words 8-11, 13 and 15 rotate through r8-r10 via the
@ off-load area at sp+4*16.
.align	4
.Loop:
	subs	r11,r11,#1
	add	r0,r0,r4
	mov	r12,r12,ror#16
	add	r1,r1,r5
	mov	r10,r10,ror#16
	eor	r12,r12,r0,ror#16
	eor	r10,r10,r1,ror#16
	add	r8,r8,r12
	mov	r4,r4,ror#20
	add	r9,r9,r10
	mov	r5,r5,ror#20
	eor	r4,r4,r8,ror#20
	eor	r5,r5,r9,ror#20
	add	r0,r0,r4
	mov	r12,r12,ror#24
	add	r1,r1,r5
	mov	r10,r10,ror#24
	eor	r12,r12,r0,ror#24
	eor	r10,r10,r1,ror#24
	add	r8,r8,r12
	mov	r4,r4,ror#25
	add	r9,r9,r10
	mov	r5,r5,ror#25
	str	r10,[sp,#4*(16+13)]
	ldr	r10,[sp,#4*(16+15)]
	eor	r4,r4,r8,ror#25
	eor	r5,r5,r9,ror#25
	str	r8,[sp,#4*(16+8)]
	ldr	r8,[sp,#4*(16+10)]
	add	r2,r2,r6
	mov	r14,r14,ror#16
	str	r9,[sp,#4*(16+9)]
	ldr	r9,[sp,#4*(16+11)]
	add	r3,r3,r7
	mov	r10,r10,ror#16
	eor	r14,r14,r2,ror#16
	eor	r10,r10,r3,ror#16
	add	r8,r8,r14
	mov	r6,r6,ror#20
	add	r9,r9,r10
	mov	r7,r7,ror#20
	eor	r6,r6,r8,ror#20
	eor	r7,r7,r9,ror#20
	add	r2,r2,r6
	mov	r14,r14,ror#24
	add	r3,r3,r7
	mov	r10,r10,ror#24
	eor	r14,r14,r2,ror#24
	eor	r10,r10,r3,ror#24
	add	r8,r8,r14
	mov	r6,r6,ror#25
	add	r9,r9,r10
	mov	r7,r7,ror#25
	eor	r6,r6,r8,ror#25
	eor	r7,r7,r9,ror#25
	add	r0,r0,r5
	mov	r10,r10,ror#16
	add	r1,r1,r6
	mov	r12,r12,ror#16
	eor	r10,r10,r0,ror#16
	eor	r12,r12,r1,ror#16
	add	r8,r8,r10
	mov	r5,r5,ror#20
	add	r9,r9,r12
	mov	r6,r6,ror#20
	eor	r5,r5,r8,ror#20
	eor	r6,r6,r9,ror#20
	add	r0,r0,r5
	mov	r10,r10,ror#24
	add	r1,r1,r6
	mov	r12,r12,ror#24
	eor	r10,r10,r0,ror#24
	eor	r12,r12,r1,ror#24
	add	r8,r8,r10
	mov	r5,r5,ror#25
	str	r10,[sp,#4*(16+15)]
	ldr	r10,[sp,#4*(16+13)]
	add	r9,r9,r12
	mov	r6,r6,ror#25
	eor	r5,r5,r8,ror#25
	eor	r6,r6,r9,ror#25
	str	r8,[sp,#4*(16+10)]
	ldr	r8,[sp,#4*(16+8)]
	add	r2,r2,r7
	mov	r10,r10,ror#16
	str	r9,[sp,#4*(16+11)]
	ldr	r9,[sp,#4*(16+9)]
	add	r3,r3,r4
	mov	r14,r14,ror#16
	eor	r10,r10,r2,ror#16
	eor	r14,r14,r3,ror#16
	add	r8,r8,r10
	mov	r7,r7,ror#20
	add	r9,r9,r14
	mov	r4,r4,ror#20
	eor	r7,r7,r8,ror#20
	eor	r4,r4,r9,ror#20
	add	r2,r2,r7
	mov	r10,r10,ror#24
	add	r3,r3,r4
	mov	r14,r14,ror#24
	eor	r10,r10,r2,ror#24
	eor	r14,r14,r3,ror#24
	add	r8,r8,r10
	mov	r7,r7,ror#25
	add	r9,r9,r14
	mov	r4,r4,ror#25
	eor	r7,r7,r8,ror#25
	eor	r4,r4,r9,ror#25
	bne	.Loop

	ldr	r11,[sp,#4*(32+2)]	@ load len

	str	r8,[sp,#4*(16+8)]	@ modulo-scheduled store
	str	r9,[sp,#4*(16+9)]
	str	r12,[sp,#4*(16+12)]
	str	r10,[sp,#4*(16+13)]
	str	r14,[sp,#4*(16+14)]

	@ The first half of the 512-bit result is now in r0-r7 and the
	@ second half is in memory at sp+4*(16+8).

	cmp	r11,#64			@ a whole block left?
#ifdef __thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ short: point at the frame, or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ short: point at the frame, or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	r8,[sp,#4*(0)]		@ load key material
	ldr	r9,[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	r10,r12,r14
	tst	r10,#3			@ are input and output aligned?
	ldr	r10,[sp,#4*(2)]
	bne	.Lunaligned
	cmp	r11,#64			@ restore flags
# else
	ldr	r10,[sp,#4*(2)]
# endif
	ldr	r11,[sp,#4*(3)]

	@ ---- word-wise path, words 0-3 ----
	add	r0,r0,r8		@ accumulate key material
	add	r1,r1,r9
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16		@ load input
	ldrhs	r9,[r12,#-12]

	add	r2,r2,r10
	add	r3,r3,r11
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r0,r0,r8		@ xor with input
	eorhs	r1,r1,r9
	add	r8,sp,#4*(4)
	str	r0,[r14],#16		@ store output
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r2,r2,r10
	eorhs	r3,r3,r11
	ldmia	r8,{r8-r11}		@ load key material
	str	r1,[r14,#-12]
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	@ ---- words 4-7 ----
	add	r4,r4,r8		@ accumulate key material
	add	r5,r5,r9
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16		@ load input
	ldrhs	r9,[r12,#-12]
	add	r6,r6,r10
	add	r7,r7,r11
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r4,r4,r8
	eorhs	r5,r5,r9
	add	r8,sp,#4*(8)
	str	r4,[r14],#16		@ store output
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r6,r6,r10
	eorhs	r7,r7,r11
	str	r5,[r14,#-12]
	ldmia	r8,{r8-r11}		@ load key material
	str	r6,[r14,#-8]
	add	r0,sp,#4*(16+8)
	str	r7,[r14,#-4]

	ldmia	r0,{r0-r7}		@ load second half

	@ ---- words 8-11 ----
	add	r0,r0,r8		@ accumulate key material
	add	r1,r1,r9
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16		@ load input
	ldrhs	r9,[r12,#-12]
# ifdef __thumb2__
	itt	hi
# endif
	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
	add	r2,r2,r10
	add	r3,r3,r11
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r0,r0,r8
	eorhs	r1,r1,r9
	add	r8,sp,#4*(12)
	str	r0,[r14],#16		@ store output
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r2,r2,r10
	eorhs	r3,r3,r11
	str	r1,[r14,#-12]
	ldmia	r8,{r8-r11}		@ load key material
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	@ ---- words 12-15 ----
	add	r4,r4,r8		@ accumulate key material
	add	r5,r5,r9
# ifdef __thumb2__
	itt	hi
# endif
	addhi	r8,r8,#1		@ next counter value
	strhi	r8,[sp,#4*(12)]		@ save next counter value
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16		@ load input
	ldrhs	r9,[r12,#-12]
	add	r6,r6,r10
	add	r7,r7,r11
# ifdef __thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r4,r4,r8
	eorhs	r5,r5,r9
# ifdef __thumb2__
	it	ne
# endif
	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
# ifdef __thumb2__
	itt	hs
# endif
	eorhs	r6,r6,r10
	eorhs	r7,r7,r11
	str	r4,[r14],#16		@ store output
	str	r5,[r14,#-12]
# ifdef __thumb2__
	it	hs
# endif
	subhs	r11,r8,#64		@ len-=64
	str	r6,[r14,#-8]
	str	r7,[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	r11,#64			@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	@ Byte-wise path: four groups of four words. In each group the
	@ input is fetched with byte loads (or replaced by zero when fewer
	@ than 64 bytes remain) and the result is written with byte stores.
	ldr	r11,[sp,#4*(3)]
	@ ---- words 0-3 ----
	add	r0,r0,r8		@ accumulate key material
	add	r1,r1,r9
	add	r2,r2,r10
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8		@ zero or ...
	ldrhsb	r8,[r12],#16		@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r3,r3,r11
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r0,r8,r0		@ xor with input (or zero)
	eor	r1,r9,r1
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]		@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r2,r10,r2
	strb	r0,[r14],#16		@ store output
	eor	r3,r11,r3
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r1,[r14,#-12]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-8]
	eor	r1,r9,r1,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]		@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r3,[r14,#-4]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-15]
	eor	r3,r11,r3,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r1,[r14,#-11]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-7]
	eor	r1,r9,r1,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]		@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r3,[r14,#-3]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-14]
	eor	r3,r11,r3,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r1,[r14,#-10]
	strb	r2,[r14,#-6]
	eor	r0,r8,r0,lsr#8
	strb	r3,[r14,#-2]
	eor	r1,r9,r1,lsr#8
	strb	r0,[r14,#-13]
	eor	r2,r10,r2,lsr#8
	strb	r1,[r14,#-9]
	eor	r3,r11,r3,lsr#8
	strb	r2,[r14,#-5]
	strb	r3,[r14,#-1]
	@ ---- words 4-7 ----
	add	r8,sp,#4*(4+0)
	ldmia	r8,{r8-r11}		@ load key material
	add	r0,sp,#4*(16+8)
	add	r4,r4,r8		@ accumulate key material
	add	r5,r5,r9
	add	r6,r6,r10
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8		@ zero or ...
	ldrhsb	r8,[r12],#16		@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r7,r7,r11
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r4,r8,r4		@ xor with input (or zero)
	eor	r5,r9,r5
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]		@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r6,r10,r6
	strb	r4,[r14],#16		@ store output
	eor	r7,r11,r7
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r5,[r14,#-12]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-8]
	eor	r5,r9,r5,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]		@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r7,[r14,#-4]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-15]
	eor	r7,r11,r7,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r5,[r14,#-11]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-7]
	eor	r5,r9,r5,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]		@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r7,[r14,#-3]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-14]
	eor	r7,r11,r7,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r5,[r14,#-10]
	strb	r6,[r14,#-6]
	eor	r4,r8,r4,lsr#8
	strb	r7,[r14,#-2]
	eor	r5,r9,r5,lsr#8
	strb	r4,[r14,#-13]
	eor	r6,r10,r6,lsr#8
	strb	r5,[r14,#-9]
	eor	r7,r11,r7,lsr#8
	strb	r6,[r14,#-5]
	strb	r7,[r14,#-1]
	@ ---- words 8-11 ----
	add	r8,sp,#4*(4+4)
	ldmia	r8,{r8-r11}		@ load key material
	ldmia	r0,{r0-r7}		@ load second half
# ifdef __thumb2__
	itt	hi
# endif
	strhi	r10,[sp,#4*(16+10)]	@ copy "rx"
	strhi	r11,[sp,#4*(16+11)]	@ copy "rx"
	add	r0,r0,r8		@ accumulate key material
	add	r1,r1,r9
	add	r2,r2,r10
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8		@ zero or ...
	ldrhsb	r8,[r12],#16		@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r3,r3,r11
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r0,r8,r0		@ xor with input (or zero)
	eor	r1,r9,r1
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]		@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r2,r10,r2
	strb	r0,[r14],#16		@ store output
	eor	r3,r11,r3
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r1,[r14,#-12]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-8]
	eor	r1,r9,r1,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]		@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r3,[r14,#-4]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-15]
	eor	r3,r11,r3,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r1,[r14,#-11]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-7]
	eor	r1,r9,r1,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]		@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r3,[r14,#-3]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-14]
	eor	r3,r11,r3,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r1,[r14,#-10]
	strb	r2,[r14,#-6]
	eor	r0,r8,r0,lsr#8
	strb	r3,[r14,#-2]
	eor	r1,r9,r1,lsr#8
	strb	r0,[r14,#-13]
	eor	r2,r10,r2,lsr#8
	strb	r1,[r14,#-9]
	eor	r3,r11,r3,lsr#8
	strb	r2,[r14,#-5]
	strb	r3,[r14,#-1]
	@ ---- words 12-15 ----
	add	r8,sp,#4*(4+8)
	ldmia	r8,{r8-r11}		@ load key material
	add	r4,r4,r8		@ accumulate key material
# ifdef __thumb2__
	itt	hi
# endif
	addhi	r8,r8,#1		@ next counter value
	strhi	r8,[sp,#4*(12)]		@ save next counter value
	add	r5,r5,r9
	add	r6,r6,r10
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8		@ zero or ...
	ldrhsb	r8,[r12],#16		@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r7,r7,r11
# ifdef __thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r4,r8,r4		@ xor with input (or zero)
	eor	r5,r9,r5
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]		@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r6,r10,r6
	strb	r4,[r14],#16		@ store output
	eor	r7,r11,r7
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r5,[r14,#-12]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-8]
	eor	r5,r9,r5,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]		@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r7,[r14,#-4]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-15]
	eor	r7,r11,r7,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r5,[r14,#-11]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-7]
	eor	r5,r9,r5,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]		@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r7,[r14,#-3]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-14]
	eor	r7,r11,r7,lsr#8
# ifdef __thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r5,[r14,#-10]
	strb	r6,[r14,#-6]
	eor	r4,r8,r4,lsr#8
	strb	r7,[r14,#-2]
	eor	r5,r9,r5,lsr#8
	strb	r4,[r14,#-13]
	eor	r6,r10,r6,lsr#8
	strb	r5,[r14,#-9]
	eor	r7,r11,r7,lsr#8
	strb	r6,[r14,#-5]
	strb	r7,[r14,#-1]
# ifdef __thumb2__
	it	ne
# endif
	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
# ifdef __thumb2__
	it	hs
# endif
	subhs	r11,r8,#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

	@ Fewer than 64 bytes were left: the block was written to the frame
	@ at sp+0, so xor it into the real output one byte at a time.
	@ r8 holds the remaining length.
.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	r9,sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	r10,[r9],#1		@ read buffer on stack
	ldrb	r11,[r12],#1		@ read input
	subs	r8,r8,#1
	eor	r11,r11,r10
	strb	r11,[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)		@ release frame and the pushed r0-r2
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ ----------------------------------------------------------------------
@ ChaCha20_neon
@   Takes the same register and stack arguments as ChaCha20_ctr32, which
@   branches to .LChaCha20_neon after pushing the same register set.
@   Each outer iteration produces 64*4 bytes: three blocks in the NEON
@   registers q0-q11 and a fourth in the integer registers, with the two
@   instruction streams interleaved.
@   Frame (word offsets from sp): 0..15 key material, 16..31 off-load
@   area (words 16..21 also hold the one, two and four constants),
@   32..35 saved r0-r3, 36..51 saved d8-d15.
@   When the remaining length drops to 64*2 or less at the top of the
@   outer loop, .Lbreak_neon converts the frame and continues at .Loop.
@ ----------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ stack argument: counter and nonce pointer
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{q1,q2},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{q3},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{q0},[r14]!		@ load sigma
	vld1.32	{q12},[r14]		@ one
	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
	vshl.i32	d26,d24,#1		@ two
	vstr	d24,[sp,#4*(16+0)]
	vshl.i32	d28,d24,#2		@ four
	vstr	d26,[sp,#4*(16+2)]
	vmov	q4,q0
	vstr	d28,[sp,#4*(16+4)]
	vmov	q8,q0
	vmov	q5,q1
	vmov	q9,q1
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	r11,#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	q4,q0
	str	r11,[sp,#4*(32+2)]	@ save len
	vmov	q8,q0
	str	r12,[sp,#4*(32+1)]	@ save inp
	vmov	q5,q1
	str	r14,[sp,#4*(32+0)]	@ save out
	vmov	q9,q1
.Loop_neon_enter:
	ldr	r11,[sp,#4*(15)]
	vadd.i32	q7,q3,q12		@ counter+1
	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
	vmov	q6,q2
	ldr	r10,[sp,#4*(13)]
	vmov	q10,q2
	ldr	r14,[sp,#4*(14)]
	vadd.i32	q11,q7,q12		@ counter+2
	str	r11,[sp,#4*(16+15)]
	mov	r11,#10			@ iteration count for .Loop_neon
	add	r12,r12,#3		@ counter+3
	b	.Loop_neon

@ Round loop: NEON instructions advance three blocks (q0-q3, q4-q7,
@ q8-q11) while the interleaved integer instructions advance a fourth.
.align	4
.Loop_neon:
	subs	r11,r11,#1
	vadd.i32	q0,q0,q1
	add	r0,r0,r4
	vadd.i32	q4,q4,q5
	mov	r12,r12,ror#16
	vadd.i32	q8,q8,q9
	add	r1,r1,r5
	veor	q3,q3,q0
	mov	r10,r10,ror#16
	veor	q7,q7,q4
	eor	r12,r12,r0,ror#16
	veor	q11,q11,q8
	eor	r10,r10,r1,ror#16
	vrev32.16	q3,q3
	add	r8,r8,r12
	vrev32.16	q7,q7
	mov	r4,r4,ror#20
	vrev32.16	q11,q11
	add	r9,r9,r10
	vadd.i32	q2,q2,q3
	mov	r5,r5,ror#20
	vadd.i32	q6,q6,q7
	eor	r4,r4,r8,ror#20
	vadd.i32	q10,q10,q11
	eor	r5,r5,r9,ror#20
	veor	q12,q1,q2
	add	r0,r0,r4
	veor	q13,q5,q6
	mov	r12,r12,ror#24
	veor	q14,q9,q10
	add	r1,r1,r5
	vshr.u32	q1,q12,#20
	mov	r10,r10,ror#24
	vshr.u32	q5,q13,#20
	eor	r12,r12,r0,ror#24
	vshr.u32	q9,q14,#20
	eor	r10,r10,r1,ror#24
	vsli.32	q1,q12,#12
	add	r8,r8,r12
	vsli.32	q5,q13,#12
	mov	r4,r4,ror#25
	vsli.32	q9,q14,#12
	add	r9,r9,r10
	vadd.i32	q0,q0,q1
	mov	r5,r5,ror#25
	vadd.i32	q4,q4,q5
	str	r10,[sp,#4*(16+13)]
	vadd.i32	q8,q8,q9
	ldr	r10,[sp,#4*(16+15)]
	veor	q12,q3,q0
	eor	r4,r4,r8,ror#25
	veor	q13,q7,q4
	eor	r5,r5,r9,ror#25
	veor	q14,q11,q8
	str	r8,[sp,#4*(16+8)]
	vshr.u32	q3,q12,#24
	ldr	r8,[sp,#4*(16+10)]
	vshr.u32	q7,q13,#24
	add	r2,r2,r6
	vshr.u32	q11,q14,#24
	mov	r14,r14,ror#16
	vsli.32	q3,q12,#8
	str	r9,[sp,#4*(16+9)]
	vsli.32	q7,q13,#8
	ldr	r9,[sp,#4*(16+11)]
	vsli.32	q11,q14,#8
	add	r3,r3,r7
	vadd.i32	q2,q2,q3
	mov	r10,r10,ror#16
	vadd.i32	q6,q6,q7
	eor	r14,r14,r2,ror#16
	vadd.i32	q10,q10,q11
	eor	r10,r10,r3,ror#16
	veor	q12,q1,q2
	add	r8,r8,r14
	veor	q13,q5,q6
	mov	r6,r6,ror#20
	veor	q14,q9,q10
	add	r9,r9,r10
	vshr.u32	q1,q12,#25
	mov	r7,r7,ror#20
	vshr.u32	q5,q13,#25
	eor	r6,r6,r8,ror#20
	vshr.u32	q9,q14,#25
	eor	r7,r7,r9,ror#20
	vsli.32	q1,q12,#7
	add	r2,r2,r6
	vsli.32	q5,q13,#7
	mov	r14,r14,ror#24
	vsli.32	q9,q14,#7
	add	r3,r3,r7
	vext.8	q2,q2,q2,#8
	mov	r10,r10,ror#24
	vext.8	q6,q6,q6,#8
	eor	r14,r14,r2,ror#24
	vext.8	q10,q10,q10,#8
	eor	r10,r10,r3,ror#24
	vext.8	q1,q1,q1,#4
	add	r8,r8,r14
	vext.8	q5,q5,q5,#4
	mov	r6,r6,ror#25
	vext.8	q9,q9,q9,#4
	add	r9,r9,r10
	vext.8	q3,q3,q3,#12
	mov	r7,r7,ror#25
	vext.8	q7,q7,q7,#12
	eor	r6,r6,r8,ror#25
	vext.8	q11,q11,q11,#12
	eor	r7,r7,r9,ror#25
	vadd.i32	q0,q0,q1
	add	r0,r0,r5
	vadd.i32	q4,q4,q5
	mov	r10,r10,ror#16
	vadd.i32	q8,q8,q9
	add	r1,r1,r6
	veor	q3,q3,q0
	mov	r12,r12,ror#16
	veor	q7,q7,q4
	eor	r10,r10,r0,ror#16
	veor	q11,q11,q8
	eor	r12,r12,r1,ror#16
	vrev32.16	q3,q3
	add	r8,r8,r10
	vrev32.16	q7,q7
	mov	r5,r5,ror#20
	vrev32.16	q11,q11
	add	r9,r9,r12
	vadd.i32	q2,q2,q3
	mov	r6,r6,ror#20
	vadd.i32	q6,q6,q7
	eor	r5,r5,r8,ror#20
	vadd.i32	q10,q10,q11
	eor	r6,r6,r9,ror#20
	veor	q12,q1,q2
	add	r0,r0,r5
	veor	q13,q5,q6
	mov	r10,r10,ror#24
	veor	q14,q9,q10
	add	r1,r1,r6
	vshr.u32	q1,q12,#20
	mov	r12,r12,ror#24
	vshr.u32	q5,q13,#20
	eor	r10,r10,r0,ror#24
	vshr.u32	q9,q14,#20
	eor	r12,r12,r1,ror#24
	vsli.32	q1,q12,#12
	add	r8,r8,r10
	vsli.32	q5,q13,#12
	mov	r5,r5,ror#25
	vsli.32	q9,q14,#12
	str	r10,[sp,#4*(16+15)]
	vadd.i32	q0,q0,q1
	ldr	r10,[sp,#4*(16+13)]
	vadd.i32	q4,q4,q5
	add	r9,r9,r12
	vadd.i32	q8,q8,q9
	mov	r6,r6,ror#25
	veor	q12,q3,q0
	eor	r5,r5,r8,ror#25
	veor	q13,q7,q4
	eor	r6,r6,r9,ror#25
	veor	q14,q11,q8
	str	r8,[sp,#4*(16+10)]
	vshr.u32	q3,q12,#24
	ldr	r8,[sp,#4*(16+8)]
	vshr.u32	q7,q13,#24
	add	r2,r2,r7
	vshr.u32	q11,q14,#24
	mov	r10,r10,ror#16
	vsli.32	q3,q12,#8
	str	r9,[sp,#4*(16+11)]
	vsli.32	q7,q13,#8
	ldr	r9,[sp,#4*(16+9)]
	vsli.32	q11,q14,#8
	add	r3,r3,r4
	vadd.i32	q2,q2,q3
	mov	r14,r14,ror#16
	vadd.i32	q6,q6,q7
	eor	r10,r10,r2,ror#16
	vadd.i32	q10,q10,q11
	eor	r14,r14,r3,ror#16
	veor	q12,q1,q2
	add	r8,r8,r10
	veor	q13,q5,q6
	mov	r7,r7,ror#20
	veor	q14,q9,q10
	add	r9,r9,r14
	vshr.u32	q1,q12,#25
	mov	r4,r4,ror#20
	vshr.u32	q5,q13,#25
	eor	r7,r7,r8,ror#20
	vshr.u32	q9,q14,#25
	eor	r4,r4,r9,ror#20
	vsli.32	q1,q12,#7
	add	r2,r2,r7
	vsli.32	q5,q13,#7
	mov	r10,r10,ror#24
	vsli.32	q9,q14,#7
	add	r3,r3,r4
	vext.8	q2,q2,q2,#8
	mov	r14,r14,ror#24
	vext.8	q6,q6,q6,#8
	eor	r10,r10,r2,ror#24
	vext.8	q10,q10,q10,#8
	eor	r14,r14,r3,ror#24
	vext.8	q1,q1,q1,#12
	add	r8,r8,r10
	vext.8	q5,q5,q5,#12
	mov	r7,r7,ror#25
	vext.8	q9,q9,q9,#12
	add	r9,r9,r14
	vext.8	q3,q3,q3,#4
	mov	r4,r4,ror#25
	vext.8	q7,q7,q7,#4
	eor	r7,r7,r8,ror#25
	vext.8	q11,q11,q11,#4
	eor	r4,r4,r9,ror#25
	bne	.Loop_neon

	add	r11,sp,#32
	vld1.32	{q12,q13},[sp]		@ load key material
	vld1.32	{q14,q15},[r11]

	ldr	r11,[sp,#4*(32+2)]	@ load len

	str	r8,[sp,#4*(16+8)]	@ modulo-scheduled store
	str	r9,[sp,#4*(16+9)]
	str	r12,[sp,#4*(16+12)]
	str	r10,[sp,#4*(16+13)]
	str	r14,[sp,#4*(16+14)]

	@ The first half of the integer block is now in r0-r7 and its
	@ second half is in memory at sp+4*(16+8).

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	q0,q0,q12		@ accumulate key material
	vadd.i32	q4,q4,q12
	vadd.i32	q8,q8,q12
	vldr	d24,[sp,#4*(16+0)]	@ one

	vadd.i32	q1,q1,q13
	vadd.i32	q5,q5,q13
	vadd.i32	q9,q9,q13
	vldr	d26,[sp,#4*(16+2)]	@ two

	vadd.i32	q2,q2,q14
	vadd.i32	q6,q6,q14
	vadd.i32	q10,q10,q14
	vadd.i32	d14,d14,d24		@ counter+1
	vadd.i32	d22,d22,d26		@ counter+2

	vadd.i32	q3,q3,q15
	vadd.i32	q7,q7,q15
	vadd.i32	q11,q11,q15

	cmp	r11,#64*4
	blo	.Ltail_neon

	vld1.8	{q12,q13},[r12]!	@ load input
	mov	r11,sp
	vld1.8	{q14,q15},[r12]!
	veor	q0,q0,q12		@ xor with input
	veor	q1,q1,q13
	vld1.8	{q12,q13},[r12]!
	veor	q2,q2,q14
	veor	q3,q3,q15
	vld1.8	{q14,q15},[r12]!

	veor	q4,q4,q12
	vst1.8	{q0,q1},[r14]!		@ store output
	veor	q5,q5,q13
	vld1.8	{q12,q13},[r12]!
	veor	q6,q6,q14
	vst1.8	{q2,q3},[r14]!
	veor	q7,q7,q15
	vld1.8	{q14,q15},[r12]!

	veor	q8,q8,q12
	vld1.32	{q0,q1},[r11]!		@ load for next iteration
	veor	d25,d25,d25
	vldr	d24,[sp,#4*(16+4)]	@ four
	veor	q9,q9,q13
	vld1.32	{q2,q3},[r11]
	veor	q10,q10,q14
	vst1.8	{q4,q5},[r14]!
	veor	q11,q11,q15
	vst1.8	{q6,q7},[r14]!

	vadd.i32	d6,d6,d24		@ next counter value
	vldr	d24,[sp,#4*(16+0)]	@ one

	@ ---- integer block, words 0-3 ----
	ldmia	sp,{r8-r11}		@ load key material
	add	r0,r0,r8		@ accumulate key material
	ldr	r8,[r12],#16		@ load input
	vst1.8	{q8,q9},[r14]!
	add	r1,r1,r9
	ldr	r9,[r12,#-12]
	vst1.8	{q10,q11},[r14]!
	add	r2,r2,r10
	ldr	r10,[r12,#-8]
	add	r3,r3,r11
	ldr	r11,[r12,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	eor	r0,r0,r8		@ xor with input
	add	r8,sp,#4*(4)
	eor	r1,r1,r9
	str	r0,[r14],#16		@ store output
	eor	r2,r2,r10
	str	r1,[r14,#-12]
	eor	r3,r3,r11
	ldmia	r8,{r8-r11}		@ load key material
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	@ ---- words 4-7 ----
	add	r4,r4,r8		@ accumulate key material
	ldr	r8,[r12],#16		@ load input
	add	r5,r5,r9
	ldr	r9,[r12,#-12]
	add	r6,r6,r10
	ldr	r10,[r12,#-8]
	add	r7,r7,r11
	ldr	r11,[r12,#-4]
# ifdef __ARMEB__
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
	eor	r4,r4,r8
	add	r8,sp,#4*(8)
	eor	r5,r5,r9
	str	r4,[r14],#16		@ store output
	eor	r6,r6,r10
	str	r5,[r14,#-12]
	eor	r7,r7,r11
	ldmia	r8,{r8-r11}		@ load key material
	str	r6,[r14,#-8]
	add	r0,sp,#4*(16+8)
	str	r7,[r14,#-4]

	ldmia	r0,{r0-r7}		@ load second half

	@ ---- words 8-11 ----
	add	r0,r0,r8		@ accumulate key material
	ldr	r8,[r12],#16		@ load input
	add	r1,r1,r9
	ldr	r9,[r12,#-12]
# ifdef __thumb2__
	it	hi
# endif
	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
	add	r2,r2,r10
	ldr	r10,[r12,#-8]
# ifdef __thumb2__
	it	hi
# endif
	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
	add	r3,r3,r11
	ldr	r11,[r12,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	eor	r0,r0,r8
	add	r8,sp,#4*(12)
	eor	r1,r1,r9
	str	r0,[r14],#16		@ store output
	eor	r2,r2,r10
	str	r1,[r14,#-12]
	eor	r3,r3,r11
	ldmia	r8,{r8-r11}		@ load key material
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	@ ---- words 12-15 ----
	add	r4,r4,r8		@ accumulate key material
	add	r8,r8,#4		@ next counter value
	add	r5,r5,r9
	str	r8,[sp,#4*(12)]		@ save next counter value
	ldr	r8,[r12],#16		@ load input
	add	r6,r6,r10
	add	r4,r4,#3		@ counter+3
	ldr	r9,[r12,#-12]
	add	r7,r7,r11
	ldr	r10,[r12,#-8]
	ldr	r11,[r12,#-4]
# ifdef __ARMEB__
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
	eor	r4,r4,r8
# ifdef __thumb2__
	it	hi
# endif
	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
	eor	r5,r5,r9
	eor	r6,r6,r10
	str	r4,[r14],#16		@ store output
	eor	r7,r7,r11
	str	r5,[r14,#-12]
	sub	r11,r8,#64*4		@ len-=64*4
	str	r6,[r14,#-8]
	str	r7,[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ Harmonize the NEON and integer-only stack frames: data is read
	@ from the NEON frame but written where the integer-only frame
	@ expects it. The two frames are 4*(32+4+16-32)=4*(20) bytes apart.

	str	r11,[sp,#4*(20+32+2)]	@ save len
	add	r11,sp,#4*(32+4)
	str	r12,[sp,#4*(20+32+1)]	@ save inp
	str	r14,[sp,#4*(20+32+0)]	@ save out

	ldr	r12,[sp,#4*(16+10)]
	ldr	r14,[sp,#4*(16+11)]
	vldmia	r11,{d8-d15}		@ fulfill ABI requirement
	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"

	ldr	r11,[sp,#4*(15)]
	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
	ldr	r10,[sp,#4*(13)]
	ldr	r14,[sp,#4*(14)]
	str	r11,[sp,#4*(20+16+15)]
	add	r11,sp,#4*(20)
	vst1.32	{q0,q1},[r11]!		@ copy key
	add	sp,sp,#4*(20)		@ switch frame
	vst1.32	{q2,q3},[r11]
	mov	r11,#10
	b	.Loop			@ go integer-only

	@ Fewer than 64*4 bytes remain: consume as many whole NEON blocks as
	@ fit, then leave the next block on the stack for the byte loop.
.align	4
.Ltail_neon:
	cmp	r11,#64*3
	bhs	.L192_or_more_neon
	cmp	r11,#64*2
	bhs	.L128_or_more_neon
	cmp	r11,#64*1
	bhs	.L64_or_more_neon

	add	r8,sp,#4*(8)
	vst1.8	{q0,q1},[sp]
	add	r10,sp,#4*(0)
	vst1.8	{q2,q3},[r8]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{q12,q13},[r12]!
	vld1.8	{q14,q15},[r12]!
	veor	q0,q0,q12
	veor	q1,q1,q13
	veor	q2,q2,q14
	veor	q3,q3,q15
	vst1.8	{q0,q1},[r14]!
	vst1.8	{q2,q3},[r14]!

	beq	.Ldone_neon

	add	r8,sp,#4*(8)
	vst1.8	{q4,q5},[sp]
	add	r10,sp,#4*(0)
	vst1.8	{q6,q7},[r8]
	sub	r11,r11,#64*1		@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{q12,q13},[r12]!
	vld1.8	{q14,q15},[r12]!
	veor	q0,q0,q12
	veor	q1,q1,q13
	vld1.8	{q12,q13},[r12]!
	veor	q2,q2,q14
	veor	q3,q3,q15
	vld1.8	{q14,q15},[r12]!

	veor	q4,q4,q12
	veor	q5,q5,q13
	vst1.8	{q0,q1},[r14]!
	veor	q6,q6,q14
	vst1.8	{q2,q3},[r14]!
	veor	q7,q7,q15
	vst1.8	{q4,q5},[r14]!
	vst1.8	{q6,q7},[r14]!

	beq	.Ldone_neon

	add	r8,sp,#4*(8)
	vst1.8	{q8,q9},[sp]
	add	r10,sp,#4*(0)
	vst1.8	{q10,q11},[r8]
	sub	r11,r11,#64*2		@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{q12,q13},[r12]!
	vld1.8	{q14,q15},[r12]!
	veor	q0,q0,q12
	veor	q1,q1,q13
	vld1.8	{q12,q13},[r12]!
	veor	q2,q2,q14
	veor	q3,q3,q15
	vld1.8	{q14,q15},[r12]!

	veor	q4,q4,q12
	veor	q5,q5,q13
	vld1.8	{q12,q13},[r12]!
	veor	q6,q6,q14
	vst1.8	{q0,q1},[r14]!
	veor	q7,q7,q15
	vld1.8	{q14,q15},[r12]!

	veor	q8,q8,q12
	vst1.8	{q2,q3},[r14]!
	veor	q9,q9,q13
	vst1.8	{q4,q5},[r14]!
	veor	q10,q10,q14
	vst1.8	{q6,q7},[r14]!
	veor	q11,q11,q15
	vst1.8	{q8,q9},[r14]!
	vst1.8	{q10,q11},[r14]!

	beq	.Ldone_neon

	@ Finish the integer block and park it at sp+0 for the byte loop.
	ldmia	sp,{r8-r11}		@ load key material
	add	r0,r0,r8		@ accumulate key material
	add	r8,sp,#4*(4)
	add	r1,r1,r9
	add	r2,r2,r10
	add	r3,r3,r11
	ldmia	r8,{r8-r11}		@ load key material

	add	r4,r4,r8		@ accumulate key material
	add	r8,sp,#4*(8)
	add	r5,r5,r9
	add	r6,r6,r10
	add	r7,r7,r11
	ldmia	r8,{r8-r11}		@ load key material
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
	stmia	sp,{r0-r7}
	add	r0,sp,#4*(16+8)

	ldmia	r0,{r0-r7}		@ load second half

	add	r0,r0,r8		@ accumulate key material
	add	r8,sp,#4*(12)
	add	r1,r1,r9
	add	r2,r2,r10
	add	r3,r3,r11
	ldmia	r8,{r8-r11}		@ load key material

	add	r4,r4,r8		@ accumulate key material
	add	r8,sp,#4*(8)
	add	r5,r5,r9
	add	r4,r4,#3		@ counter+3
	add	r6,r6,r10
	add	r7,r7,r11
	ldr	r11,[sp,#4*(32+2)]	@ re-load len
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
	stmia	r8,{r0-r7}
	add	r10,sp,#4*(0)
	sub	r11,r11,#64*3		@ len-=64*3

.Loop_tail_neon:
	ldrb	r8,[r10],#1		@ read buffer on stack
	ldrb	r9,[r12],#1		@ read input
	subs	r11,r11,#1
	eor	r8,r8,r9
	strb	r8,[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)		@ step over frame and saved r0-r3
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)		@ step over d8-d15 and the pushed r0-r2
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon

.hidden	OPENSSL_armcap_P
#endif