// Source: contrib/arm-optimized-routines/string/aarch64/strchrnul.S
/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.
   char *__strchrnul_aarch64 (const char *s, int c)
   In:  srcin (x0) = s, chrin (w1) = c
   Out: result (x0) = pointer to first occurrence of c or the
        terminating nul, whichever comes first.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

/* Locals and temporaries.  */

ENTRY (__strchrnul_aarch64)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* "byte >= (byte == chr)" is 0 only for a nul byte that is not
	   the searched character, so this merges the nul test in.  */
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1		/* Two syndrome bits per padding byte.  */
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)