Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strncmp.S
39486 views
/*1* strncmp - compare two strings2*3* Copyright (c) 2013-2022, Arm Limited.4* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception5*/67/* Assumptions:8*9* ARMv8-a, AArch64.10* MTE compatible.11*/1213#include "asmdefs.h"1415#define REP8_01 0x010101010101010116#define REP8_7f 0x7f7f7f7f7f7f7f7f1718/* Parameters and result. */19#define src1 x020#define src2 x121#define limit x222#define result x02324/* Internal variables. */25#define data1 x326#define data1w w327#define data2 x428#define data2w w429#define has_nul x530#define diff x631#define syndrome x732#define tmp1 x833#define tmp2 x934#define tmp3 x1035#define zeroones x1136#define pos x1237#define mask x1338#define endloop x1439#define count mask40#define offset pos41#define neg_offset x154243/* Define endian dependent shift operations.44On big-endian early bytes are at MSB and on little-endian LSB.45LS_FW means shifting towards early bytes.46LS_BK means shifting towards later bytes.47*/48#ifdef __AARCH64EB__49#define LS_FW lsl50#define LS_BK lsr51#else52#define LS_FW lsr53#define LS_BK lsl54#endif5556ENTRY (__strncmp_aarch64)57cbz limit, L(ret0)58eor tmp1, src1, src259mov zeroones, #REP8_0160tst tmp1, #761and count, src1, #762b.ne L(misaligned8)63cbnz count, L(mutual_align)6465/* NUL detection works on the principle that (X - 1) & (~X) & 0x8066(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and67can be done in parallel across the entire word. */68.p2align 469L(loop_aligned):70ldr data1, [src1], #871ldr data2, [src2], #872L(start_realigned):73subs limit, limit, #874sub tmp1, data1, zeroones75orr tmp2, data1, #REP8_7f76eor diff, data1, data2 /* Non-zero if differences found. */77csinv endloop, diff, xzr, hi /* Last Dword or differences. */78bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */79ccmp endloop, #0, #0, eq80b.eq L(loop_aligned)81/* End of main loop */8283L(full_check):84#ifndef __AARCH64EB__85orr syndrome, diff, has_nul86add limit, limit, 8 /* Rewind limit to before last subs. */87L(syndrome_check):88/* Limit was reached. Check if the NUL byte or the difference89is before the limit. */90rev syndrome, syndrome91rev data1, data192clz pos, syndrome93rev data2, data294lsl data1, data1, pos95cmp limit, pos, lsr #396lsl data2, data2, pos97/* But we need to zero-extend (char is unsigned) the value and then98perform a signed 32-bit subtraction. */99lsr data1, data1, #56100sub result, data1, data2, lsr #56101csel result, result, xzr, hi102ret103#else104/* Not reached the limit, must have found the end or a diff. */105tbz limit, #63, L(not_limit)106add tmp1, limit, 8107cbz limit, L(not_limit)108109lsl limit, tmp1, #3 /* Bits -> bytes. */110mov mask, #~0111lsr mask, mask, limit112bic data1, data1, mask113bic data2, data2, mask114115/* Make sure that the NUL byte is marked in the syndrome. */116orr has_nul, has_nul, mask117118L(not_limit):119/* For big-endian we cannot use the trick with the syndrome value120as carry-propagation can corrupt the upper bits if the trailing121bytes in the string contain 0x01. */122/* However, if there is no NUL byte in the dword, we can generate123the result directly. We can't just subtract the bytes as the124MSB might be significant. */125cbnz has_nul, 1f126cmp data1, data2127cset result, ne128cneg result, result, lo129ret1301:131/* Re-compute the NUL-byte detection, using a byte-reversed value. */132rev tmp3, data1133sub tmp1, tmp3, zeroones134orr tmp2, tmp3, #REP8_7f135bic has_nul, tmp1, tmp2136rev has_nul, has_nul137orr syndrome, diff, has_nul138clz pos, syndrome139/* The most-significant-non-zero bit of the syndrome marks either the140first bit that is different, or the top bit of the first zero byte.141Shifting left now will bring the critical information into the142top bits. */143L(end_quick):144lsl data1, data1, pos145lsl data2, data2, pos146/* But we need to zero-extend (char is unsigned) the value and then147perform a signed 32-bit subtraction. */148lsr data1, data1, #56149sub result, data1, data2, lsr #56150ret151#endif152153L(mutual_align):154/* Sources are mutually aligned, but are not currently at an155alignment boundary. Round down the addresses and then mask off156the bytes that precede the start point.157We also need to adjust the limit calculations, but without158overflowing if the limit is near ULONG_MAX. */159bic src1, src1, #7160bic src2, src2, #7161ldr data1, [src1], #8162neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */163ldr data2, [src2], #8164mov tmp2, #~0165LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */166/* Adjust the limit and ensure it doesn't overflow. */167adds limit, limit, count168csinv limit, limit, xzr, lo169orr data1, data1, tmp2170orr data2, data2, tmp2171b L(start_realigned)172173.p2align 4174/* Don't bother with dwords for up to 16 bytes. */175L(misaligned8):176cmp limit, #16177b.hs L(try_misaligned_words)178179L(byte_loop):180/* Perhaps we can do better than this. */181ldrb data1w, [src1], #1182ldrb data2w, [src2], #1183subs limit, limit, #1184ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */185ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */186b.eq L(byte_loop)187L(done):188sub result, data1, data2189ret190/* Align the SRC1 to a dword by doing a bytewise compare and then do191the dword loop. */192L(try_misaligned_words):193cbz count, L(src1_aligned)194195neg count, count196and count, count, #7197sub limit, limit, count198199L(page_end_loop):200ldrb data1w, [src1], #1201ldrb data2w, [src2], #1202cmp data1w, #1203ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */204b.ne L(done)205subs count, count, #1206b.hi L(page_end_loop)207208/* The following diagram explains the comparison of misaligned strings.209The bytes are shown in natural order. For little-endian, it is210reversed in the registers. The "x" bytes are before the string.211The "|" separates data that is loaded at one time.212src1 | a a a a a a a a | b b b c c c c c | . . .213src2 | x x x x x a a a a a a a a b b b | c c c c c . . .214215After shifting in each step, the data looks like this:216STEP_A STEP_B STEP_C217data1 a a a a a a a a b b b c c c c c b b b c c c c c218data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c219220The bytes with "0" are eliminated from the syndrome via mask.221222Align SRC2 down to 16 bytes. This way we can read 16 bytes at a223time from SRC2. The comparison happens in 3 steps. After each step224the loop can exit, or read from SRC1 or SRC2. */225L(src1_aligned):226/* Calculate offset from 8 byte alignment to string start in bits. No227need to mask offset since shifts are ignoring upper bits. */228lsl offset, src2, #3229bic src2, src2, #0xf230mov mask, -1231neg neg_offset, offset232ldr data1, [src1], #8233ldp tmp1, tmp2, [src2], #16234LS_BK mask, mask, neg_offset235and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */236/* Skip the first compare if data in tmp1 is irrelevant. */237tbnz offset, 6, L(misaligned_mid_loop)238239L(loop_misaligned):240/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/241LS_FW data2, tmp1, offset242LS_BK tmp1, tmp2, neg_offset243subs limit, limit, #8244orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/245sub has_nul, data1, zeroones246eor diff, data1, data2 /* Non-zero if differences found. */247orr tmp3, data1, #REP8_7f248csinv endloop, diff, xzr, hi /* If limit, set to all ones. */249bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */250orr tmp3, endloop, has_nul251cbnz tmp3, L(full_check)252253ldr data1, [src1], #8254L(misaligned_mid_loop):255/* STEP_B: Compare first part of data1 to second part of tmp2. */256LS_FW data2, tmp2, offset257#ifdef __AARCH64EB__258/* For big-endian we do a byte reverse to avoid carry-propagation259problem described above. This way we can reuse the has_nul in the260next step and also use syndrome value trick at the end. */261rev tmp3, data1262#define data1_fixed tmp3263#else264#define data1_fixed data1265#endif266sub has_nul, data1_fixed, zeroones267orr tmp3, data1_fixed, #REP8_7f268eor diff, data2, data1 /* Non-zero if differences found. */269bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */270#ifdef __AARCH64EB__271rev has_nul, has_nul272#endif273cmp limit, neg_offset, lsr #3274orr syndrome, diff, has_nul275bic syndrome, syndrome, mask /* Ignore later bytes. */276csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */277cbnz tmp3, L(syndrome_check)278279/* STEP_C: Compare second part of data1 to first part of tmp1. */280ldp tmp1, tmp2, [src2], #16281cmp limit, #8282LS_BK data2, tmp1, neg_offset283eor diff, data2, data1 /* Non-zero if differences found. */284orr syndrome, diff, has_nul285and syndrome, syndrome, mask /* Ignore earlier bytes. */286csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */287cbnz tmp3, L(syndrome_check)288289ldr data1, [src1], #8290sub limit, limit, #8291b L(loop_misaligned)292293#ifdef __AARCH64EB__294L(syndrome_check):295clz pos, syndrome296cmp pos, limit, lsl #3297b.lo L(end_quick)298#endif299300L(ret0):301mov result, #0302ret303END(__strncmp_aarch64)304305306307