/* Source: contrib/arm-optimized-routines/string/aarch64/memrchr.S */
/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

/* C equivalent: void *memrchr (const void *srcin, int chrin, size_t cntin)
   AAPCS64: x0 = srcin, w1 = chrin, x2 = cntin; returns x0 = pointer to the
   last occurrence of chrin, or NULL (0) if not found.  Uses only volatile
   registers (x0-x9, v0-v3), so no stack frame is needed.  */

#include "asmdefs.h"

/* General-purpose register roles.  */
#define srcin		x0	/* in: start of buffer (also aliased by result) */
#define chrin		w1	/* in: byte value to search for */
#define cntin		x2	/* in: buffer length in bytes */
#define result		x0	/* out: match pointer or 0 */

#define src		x3	/* 16-byte-aligned chunk pointer */
#define cntrem		x4	/* bytes remaining before the current chunk */
#define synd		x5	/* 64-bit nibble-mask syndrome */
#define shift		x6	/* bit count used to mask out-of-range matches */
#define tmp		x7	/* scratch for the final address computation */
#define end		x8	/* one past the last byte of the buffer */
#define endm1		x9	/* address of the last byte of the buffer */

/* Vector register roles.  */
#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define qdata		q1	/* current 16-byte chunk (q-view for ldr) */
#define vdata		v1	/* current 16-byte chunk (v-view) */
#define vhas_chr	v2	/* per-byte 0x00/0xff compare result */
#define vend		v3	/* narrowed/reduced compare result */
#define dend		d3	/* low 64 bits of vend, moved to synd via fmov */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched.  */

ENTRY (__memrchr_aarch64)
	add	end, srcin, cntin	/* end = one past the last byte */
	sub	endm1, end, 1		/* endm1 = address of the last byte */
	bic	src, endm1, 15		/* src = aligned chunk holding endm1 */
	cbz	cntin, L(nomatch)	/* empty buffer: nothing to find */
	ld1	{vdata.16b}, [src]	/* aligned load; cannot cross a page */
	dup	vrepchr.16b, chrin	/* splat the target byte */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = -(end << 2) mod 64: number of syndrome bits (4 per byte)
	   that lie at or beyond 'end' and must be discarded.  */
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64: 4-bit mask per byte */
	fmov	synd, dend
	lsl	synd, synd, shift	/* drop matches past the buffer end */
	cbz	synd, L(start_loop)

	/* Match in the first (last-in-memory) chunk: clz of the syndrome
	   gives 4 * (bytes between the match and endm1).  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2	/* match inside [srcin, end)?  */
	csel	result, result, xzr, hi	/* return NULL if it precedes srcin */
	ret

	nop				/* padding before the loop entry */
L(start_loop):
	subs	cntrem, src, srcin	/* bytes left before the first chunk */
	b.ls	L(nomatch)		/* first chunk already covered it all */

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)	/* odd number of chunks: enter mid-loop */
	add	src, src, 16

	.p2align 5
L(loop32):
	/* Process two 16-byte chunks per iteration, walking backwards.  */
	ldr	qdata, [src, -32]!	/* pre-decrement by 32, load chunk */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)		/* any nonzero lane => match in chunk */

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)		/* < 32 bytes left: last chunk; the
					   final range check below handles any
					   bytes before srcin */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16		/* point src at the chunk just tested */
L(end):
	/* Recompute the compare result as a nibble mask so clz can locate
	   the exact matching byte.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* address of the chunk's last byte */
#ifdef __AARCH64EB__
	rbit	synd, synd		/* big-endian: reverse so clz still
					   finds the last match in memory */
#endif
	clz	synd, synd		/* 4 * (bytes after the last match) */
	sub	tmp, tmp, synd, lsr 2	/* address of the last matching byte */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* return NULL if match precedes srcin */
	ret

L(nomatch):
	mov	result, 0		/* NULL: character not found */
	ret

END (__memrchr_aarch64)