Path: sys/crypto/openssl/arm/ghash-armv4.S
/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48	@ &rem_4bit
#endif
	add	r3,r2,r3	@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf	@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf	@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]	@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr	@ be binary compatible with V4, yet
.word	0xe12fff1e	@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
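
@ Editorial note, not produced by ghash-armv4.pl: a minimal sketch of
@ the C-level contract for the scalar routines in this file, assuming
@ declarations along the lines of OpenSSL's crypto/modes/gcm128.c:
@
@	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
@	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@	                    const u8 *inp, size_t len);
@
@ In gcm_ghash_4bit above, r0=Xi (16-byte hash value, updated in
@ place), r1=Htable (16 precomputed nibble multiples of H), r2=inp,
@ r3=len (a multiple of 16). Each step folds one nibble of Xi^inp into
@ the accumulator via a Htable lookup; the four bits shifted out on
@ the right are reduced through rem_4bit, which holds the GHASH
@ reduction polynomial pre-multiplied by each possible 4-bit remainder.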

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf	@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf	@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr	@ be binary compatible with V4, yet
.word	0xe12fff1e	@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
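
@ Editorial note: the NEON section below exports the same interface;
@ assuming gcm128.c-style declarations (parameter names are this
@ sketch's, not necessarily OpenSSL's), the prototypes are
@
@	void gcm_init_neon(u128 Htable[16], const u64 H[2]);
@	void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
@	void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@	                    const u8 *inp, size_t len);
@
@ gcm_init_neon stores a "twisted" copy of H (H<<1, XORed with the
@ 0xC2...01 reduction constant when the shifted-out bit is set) so the
@ multiply loop can finish with a cheap two-phase shift/XOR reduction.
@ ARMv7 NEON has no 64x64 polynomial multiplier, so each 64x64
@ carry-less product below is synthesized from eight 8x8 vmull.p8
@ multiplies plus byte rotations and masking.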
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!	@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63	@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7	@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26	@ H<<<=1
	veor	q3,q3,q8	@ twisted H
	vstmia	r0,{q3}

	bx	lr	@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!	@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27	@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!	@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27	@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!	@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0	@ inp^=Xi
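
@ Editorial note: .Lgmult_neon computes Xi = (Xi ^ inp) * H in
@ GF(2^128) from three 64x64 carry-less products, Karatsuba-style:
@ q0 = d26*d6, q2 = d27*d7, and the cross term q1 = d28*(d6^d7),
@ where d28 = d26^d27 was prepared above. Within each product the
@ An/Bn operands are the inputs rotated by n bytes, the d29/d30/d31
@ masks clear bits that would spill past 64, and the vext.8 shifts
@ realign the partial products before they are XORed together. The
@ "1st/2nd phase" block at the end is the bit-reflected reduction
@ modulo x^128 + x^7 + x^2 + x + 1.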
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0	@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22	@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0	@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11	@ M = G + H
	vmull.p8	q0, d26, d0	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22	@ K = A*B4
	veor	q10, q10, q0	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2	@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22	@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1	@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11	@ M = G + H
	vmull.p8	q1, d28, d2	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22	@ K = A*B4
	veor	q10, q10, q1	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7	@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4	@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7	@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22	@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2	@ L = E + F
	vmull.p8	q10, d20, d7	@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11	@ M = G + H
	vmull.p8	q2, d27, d4	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22	@ K = A*B4
	veor	q10, q10, q2	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0	@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57	@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9	@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9	@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1	@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10	@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1	@
	veor	q0,q0,q2	@
	veor	q0,q0,q10	@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!	@ write out Xi
	vst1.64	d0,[r0]

	bx	lr	@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2