Path: blob/main/contrib/arm-optimized-routines/string/arm/strcmp.S
/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

#include "asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

/* Ensure the .cantunwind directive is prepended to .fnend.
   Leaf functions cannot throw exceptions - EHABI only supports
   synchronous exceptions.  */
#define IS_LEAF

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO		lsl
#define S2LOEQ		lsleq
#define S2HI		lsr
#define MSB		0x000000ff
#define LSB		0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO		lsr
#define S2LOEQ		lsreq
#define S2HI		lsl
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#define MSB		0xff000000
#define LSB		0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   that's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

ENTRY(__strcmp_arm)
	prologue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)
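	/* The loops below rely on a single UADD8/SEL idiom to detect both
	   a NUL terminator and a mismatch: UADD8 of a data word with 0xff
	   in every byte sets the GE flag for each byte that is non-zero,
	   and SEL then forces 0xff into the syndrome wherever data1 held a
	   zero byte, while the EOR contributes any bytes that differ.  A
	   non-zero syndrome therefore means either a difference or the end
	   of the string.  */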
	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
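	/* The strings cannot be mutually 8-byte aligned here.  tmp1 still
	   holds src1 EOR src2: if the pointers agree in their low two bits
	   they are congruent modulo 4 and can be realigned for the
	   word-at-a-time loop below; otherwise fall back to L(misaligned4),
	   which compares bytes until src1 becomes word aligned and then
	   continues with the overlapping-word loops.  */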
	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

L(aligned_m1):
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF
6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF
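	/* Tail: data1/data2 hold the (possibly shifted) words containing
	   the first difference or the terminating NUL.  Put them in
	   big-endian order so that CLZ of the recomputed syndrome locates
	   the first interesting byte, then return the difference of the
	   two bytes at that position.  */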
L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */