/* Source: contrib/arm-optimized-routines/string/aarch64/strrchr-mte.S */
/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register roles (AAPCS64: x0 = s, w1 = c, result in x0).  */
#define srcin		x0	/* incoming string pointer */
#define chrin		w1	/* character to search for */
#define result		x0	/* return value: last match or NULL */

#define src		x2	/* 16-byte-aligned cursor */
#define tmp		x3
#define synd		x3	/* 64-bit syndrome (aliases tmp) */
#define shift		x4
#define src_match	x4	/* chunk addr of last chunk with a match (aliases shift) */
#define nul_match	x5	/* NUL-bit portion of syndrome */
#define chr_match	x6	/* char-bit portion of syndrome */

#define vrepchr		v0	/* chrin replicated to all 16 lanes */
#define vdata		v1	/* current 16-byte chunk */
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4	/* 0x33 per byte: selects char bits vs NUL bits */
#define vend		v5
#define dend		d5	/* low 64 bits of vend, moved to GPR via fmov */

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */

ENTRY (__strrchr_aarch64_mte)
	/* Align the load down to 16 bytes.  This never crosses into an
	   unmapped (or differently MTE-tagged) granule, which is what
	   makes the routine MTE compatible.  */
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Merge: bits 0-1 of each nibble-pair <- char match,
	   bits 2-3 <- NUL match.  */
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	/* Narrow 16 bytes to 64 bits: 4 syndrome bits per input byte.  */
	shrn	vend.8b, vhas_nul.8h, 4
	/* shift = 4 * (srcin mod 16) (shift ops use the low 6 bits only);
	   the lsr/lsl pair zeroes syndrome bits for bytes before the
	   real string start.  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	/* 0xcc.. masks the NUL bits (2-3 of each 4-bit tuple).  */
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)			/* NUL in first chunk: finish now.  */
	cbnz	synd, L(loop2_start)	/* char seen already: slow loop.  */

	/* Fast loop: no character match seen yet, scan two chunks per
	   iteration looking for either the char or NUL.  */
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* vdata >= vhas_chr is true iff byte matched char or byte is 0,
	   folding both tests into one compare (cmhs is unsigned >=).  */
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16		/* undo half of the 32-byte step.  */
L(loop1_end):
	add	src, src, 16		/* src -> chunk that hit.  */
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	/* Big-endian: lane order is reversed in the GPR, so build the
	   syndrome with the complementary insert and bit-reverse it.  */
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)		/* char hit but no NUL yet.  */
L(tail):
	/* nul_match - 1 sets all bits below the first NUL bit; ANDing
	   with the char bits keeps only matches before the terminator.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	/* clz/4 converts the highest surviving char bit back into a byte
	   index from the end of the chunk (src + 15).  */
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne	/* no match -> NULL.  */
	ret

	.p2align 4
	/* Pad so L(loop2) below lands on a fresh fetch block.  */
	nop
	nop
L(loop2_start):
	add	src, src, 16
	/* Restrict vrepmask to one nibble per 16-bit lane so the addp
	   narrowing at loop exit yields one syndrome bit-pair per byte.  */
	bic	vrepmask.8h, 0xf0

	/* Slow loop: a char match has been seen; remember the last chunk
	   (src_match) and its syndrome (chr_match) until NUL is found.  */
L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

	/* NUL found: rebuild the full-width syndrome for the final chunk
	   (clear the high byte of each halfword, then pairwise-add).  */
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	/* Prefer a match in the final chunk (before NUL); otherwise fall
	   back to the last earlier chunk recorded in the loop.  */
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1	/* compensate: src was advanced
					   16 past the recorded chunk,
					   and the clz math needs +15.  */
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)