/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

/* Every label wrapped in L() gets the .L prefix. */
#define L(label) .L ## label

/* 0x01 and 0x7f replicated into each of the eight bytes of a dword. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Arguments on entry and the return value (result reuses x0). */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/*
 * Working registers.  Note the deliberate sharing: count occupies the
 * same register as mask (x13) and offset the same register as pos (x12).
 */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define rep01		x11
#define pos		x12
#define mask		x13
#define stop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/*
 * Endian-dependent helpers.
 *
 * Big-endian keeps the early bytes of a string at the MSB end of a
 * register, little-endian at the LSB end.
 *   LS_FW     - shift towards the early bytes.
 *   LS_BK     - shift towards the later bytes.
 *   nul_probe - register fed to the NUL detector in STEP_B of the
 *               unaligned loop: a byte-reversed copy of data1 (held in
 *               tmp3) on big-endian, data1 itself on little-endian.
 */
#ifdef __AARCH64EB__
#define LS_FW		lsl
#define LS_BK		lsr
#define nul_probe	tmp3
#else
#define LS_FW		lsr
#define LS_BK		lsl
#define nul_probe	data1
#endif

/*
 * __pi_strncmp (weakly aliased to strncmp below)
 *
 * In:   src1  (x0) - first string
 *       src2  (x1) - second string
 *       limit (x2) - maximum number of bytes to compare
 * Out:  result (x0) - zero when no difference is found before a NUL
 *       byte or before limit bytes have been examined; otherwise a
 *       non-zero value whose sign reflects the ordering of the first
 *       differing bytes, compared as unsigned values.
 * Uses: x3-x15 and the condition flags as scratch; no stack.
 */
SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(zero_result)		/* Nothing to compare. */
	eor	tmp1, src1, src2
	mov	rep01, #REP8_01
	tst	tmp1, #7			/* Same offset within a dword? */
	and	count, src1, #7			/* src1 offset within its dword. */
	b.ne	L(unaligned_srcs)
	cbnz	count, L(co_aligned)

	/*
	 * NUL bytes are found with (X - 1) & ~X & 0x80 applied to every
	 * byte of the dword at once, computed here in the equivalent form
	 * (X - 1) & ~(X | 0x7f).  The value is non-zero iff some byte of
	 * X is zero.
	 */
	.p2align 4
L(dword_loop):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(dword_loop_entry):
	subs	limit, limit, #8
	sub	tmp1, data1, rep01
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2		/* Set bits mark mismatches. */
	csinv	stop, diff, xzr, hi		/* All ones on the last dword. */
	bics	has_nul, tmp1, tmp2		/* Z clear when a NUL is present. */
	ccmp	stop, #0, #0, eq		/* No NUL: test stop; else force NE. */
	b.eq	L(dword_loop)
	/* Main loop ends here. */

L(dword_tail):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, #8		/* Undo the final subs. */
L(syndrome_tail):
	/*
	 * The limit may have been hit.  Find the first marked byte and
	 * check whether it lies before the limit.
	 */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3		/* HI when that byte is in range. */
	lsl	data2, data2, pos
	/*
	 * char is unsigned here: zero-extend the two selected bytes and
	 * subtract them as signed values.
	 */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi		/* Out of range: report equal. */
	ret
#else
	/* Limit not reached: stopped on a NUL byte or on a difference. */
	tbz	limit, #63, L(within_limit)
	add	tmp1, limit, #8
	cbz	limit, L(within_limit)

	lsl	limit, tmp1, #3			/* Byte count -> bit count. */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Guarantee that the syndrome sees a NUL at the cut-off point. */
	orr	has_nul, has_nul, mask

L(within_limit):
	/*
	 * The syndrome trick is unsafe on big-endian: carry propagation
	 * can corrupt the upper bits when trailing string bytes hold 0x01.
	 * With no NUL in the dword the answer can be produced directly;
	 * a plain byte subtraction would be wrong because the MSB may be
	 * significant.
	 */
	cbnz	has_nul, L(nul_in_dword)
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
L(nul_in_dword):
	/* Redo the NUL detection on a byte-reversed copy of data1. */
	rev	tmp3, data1
	sub	tmp1, tmp3, rep01
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/*
	 * The highest set bit of the syndrome is either the first
	 * differing bit or the top bit of the first zero byte.  Shifting
	 * left by pos moves the decisive byte into the top bits.
	 */
L(extract_bytes):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/*
	 * char is unsigned here: zero-extend the two selected bytes and
	 * subtract them as signed values.
	 */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(co_aligned):
	/*
	 * Both sources share the same offset within a dword but do not
	 * start on a dword boundary.  Round both addresses down, then
	 * force the bytes that precede the start of the strings to all
	 * ones in both dwords so they compare equal and non-NUL.
	 * The limit is extended by the skipped bytes, saturating instead
	 * of wrapping when it is close to ULONG_MAX.
	 */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3		/* 64 - 8 * count, modulo 64. */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3		/* Shift amount is taken mod 64. */
	/* Extend the limit; clamp to all ones on carry out. */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(dword_loop_entry)

	.p2align 4
	/* Below 16 bytes the dword machinery is not worth setting up. */
L(unaligned_srcs):
	cmp	limit, #16
	b.hs	L(unaligned_dwords)

L(bytewise_loop):
	/* Simple byte-at-a-time compare; there may be room to improve. */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi		/* Bytes left: CS iff non-NUL. */
	ccmp	data1w, data2w, #0, cs		/* Non-NUL: EQ iff bytes match. */
	b.eq	L(bytewise_loop)
L(byte_result):
	sub	result, data1, data2
	ret

	/*
	 * Advance SRC1 to a dword boundary with a bytewise compare, then
	 * continue with the dword loop for unaligned sources.
	 */
L(unaligned_dwords):
	cbz	count, L(src1_on_dword)

	neg	count, count
	and	count, count, #7		/* Bytes up to the boundary. */
	sub	limit, limit, count

L(align_src1_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs		/* Non-NUL: EQ iff bytes match. */
	b.ne	L(byte_result)
	subs	count, count, #1
	b.hi	L(align_src1_loop)

	/*
	 * The following diagram explains the comparison of misaligned
	 * strings.  The bytes are shown in natural order.  For
	 * little-endian, it is reversed in the registers.  The "x" bytes
	 * are before the string.  The "|" separates data that is loaded
	 * at one time.
	 *
	 * src1     | a a a a a a a a | b b b c c c c c | . . .
	 * src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
	 *
	 * After shifting in each step, the data looks like this:
	 *              STEP_A              STEP_B              STEP_C
	 * data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	 * data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
	 *
	 * The bytes with "0" are eliminated from the syndrome via mask.
	 *
	 * Align SRC2 down to 16 bytes.  This way we can read 16 bytes at
	 * a time from SRC2.  The comparison happens in 3 steps.  After
	 * each step the loop can exit, or read from SRC1 or SRC2.
	 */
L(src1_on_dword):
	/*
	 * offset = distance in bits from 8-byte alignment to the start of
	 * the string.  It is left unmasked because register shifts ignore
	 * the upper bits.
	 */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, #~0
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Real value needed by cmp below. */
	/* When tmp1 holds nothing of interest, start at STEP_B. */
	tbnz	offset, #6, L(unaligned_step_b)

L(unaligned_loop):
	/* STEP_A: full 8-byte compare while SRC2 supplies enough data. */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1		/* SRC2 dword merged from two regs. */
	sub	has_nul, data1, rep01
	eor	diff, data1, data2		/* Set bits mark mismatches. */
	orr	tmp3, data1, #REP8_7f
	csinv	stop, diff, xzr, hi		/* All ones once the limit is hit. */
	bic	has_nul, has_nul, tmp3		/* Non-zero when SRC1 has a NUL. */
	orr	tmp3, stop, has_nul
	cbnz	tmp3, L(dword_tail)

	ldr	data1, [src1], #8
L(unaligned_step_b):
	/* STEP_B: first part of data1 against the second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/*
	 * Big-endian: byte-reverse first to dodge the carry-propagation
	 * problem described above.  That lets has_nul be reused in the
	 * next step and keeps the syndrome trick usable at the end.
	 */
	rev	tmp3, data1
#endif
	sub	has_nul, nul_probe, rep01
	orr	tmp3, nul_probe, #REP8_7f
	eor	diff, data2, data1		/* Set bits mark mismatches. */
	bic	has_nul, has_nul, tmp3		/* Non-zero when a NUL is present. */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Drop the later bytes. */
	csinv	tmp3, syndrome, xzr, hi		/* All ones once the limit is hit. */
	cbnz	tmp3, L(syndrome_tail)

	/* STEP_C: second part of data1 against the first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1		/* Set bits mark mismatches. */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Drop the earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi		/* All ones once the limit is hit. */
	cbnz	tmp3, L(syndrome_tail)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(unaligned_loop)

#ifdef __AARCH64EB__
L(syndrome_tail):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(extract_bytes)
	/* Otherwise fall through and report equality. */
#endif

L(zero_result):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)