Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memchr.S
/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which things occur in the original string, counting trailing zeros
 * identifies exactly which byte matched.
 */

ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
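
/*
 * Illustration (not part of the upstream file): a minimal C model of the
 * syndrome computation above, assuming a little-endian AArch64 target with
 * NEON intrinsics; the function name syndrome32 is hypothetical. The magic
 * constant 0x40100401 places each byte's match bit so that two ADDP
 * (pairwise-add) steps fold the 32 comparison bytes into a 64-bit word with
 * two bits per input byte, matches marked in the even bit positions.
 *
 *	#include <arm_neon.h>
 *	#include <stdint.h>
 *
 *	// Compute the 64-bit syndrome for one 32-byte chunk starting at p.
 *	static uint64_t syndrome32 (const uint8_t *p, uint8_t c)
 *	{
 *		uint8x16_t repchr = vdupq_n_u8 (c);
 *		// Per 4-byte lane, the mask bytes are 0x01,0x04,0x10,0x40.
 *		uint8x16_t repmask =
 *		    vreinterpretq_u8_u32 (vdupq_n_u32 (0x40100401));
 *		// Matching bytes become 0xff, then are masked down to a
 *		// single bit whose position encodes the byte's lane.
 *		uint8x16_t m1 = vandq_u8 (vceqq_u8 (vld1q_u8 (p), repchr),
 *					  repmask);
 *		uint8x16_t m2 = vandq_u8 (vceqq_u8 (vld1q_u8 (p + 16), repchr),
 *					  repmask);
 *		uint8x16_t v = vpaddq_u8 (m1, m2);	// 256->128
 *		v = vpaddq_u8 (v, v);			// 128->64
 *		return vgetq_lane_u64 (vreinterpretq_u64_u8 (v), 0);
 *	}
 *
 * If the returned syndrome is nonzero, the first match in the chunk is at
 * byte offset __builtin_ctzll (synd) >> 1: byte n of the chunk maps to bit
 * 2*n of the syndrome, mirroring the rbit/clz/lsr #1 sequence in L(tail).
 */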