/* Path: blob/main/contrib/arm-optimized-routines/string/aarch64/strlen.S */
/* 39486 views */
/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

#include "asmdefs.h"

/* Register roles.  Several names deliberately share a register because
   their live ranges never overlap:
     - len reuses x0 once srcin is no longer needed on a return path;
     - has_nul1/has_nul2 overwrite tmp1/tmp2 (x4/x5);
     - tmp/synd/shift (x2/x3/x4) are used only by the SIMD paths, which
       never touch data1/data2/tmp1 at the same time.  */
#define srcin		x0
#define len		x0

#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

/* size_t __strlen_aarch64 (const char *s)

   In:      x0 = s (srcin), pointer to a NUL-terminated string.
   Out:     x0 = len, number of bytes before the first NUL.
   Uses:    x1-x8, v0-v2 and the NZCV flags as scratch; no stack accesses.
   Note:    ENTRY, END and L() are macros supplied by asmdefs.h.  */

ENTRY (__strlen_aarch64)
	/* Take the slow path if a 32-byte read starting at srcin could run
	   past the end of the current MIN_PAGE_SIZE-sized page.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* has_nulN = (dataN - 0x01..01) & ~(dataN | 0x7f..7f).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0, compare has_nul2 with 0 (this sets C and makes
	   Z = has_nul2 == 0); otherwise force NZCV to 0.  Hence EQ means
	   neither word holds a NUL, and C is clear exactly when the first
	   word holds one.  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Word holding the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1	/* First string byte to the top for clz.  */
	csel	len, xzr, len, cc	/* Base offset: 0 (word 1) or 8 (word 2).  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* Bit index / 8 = byte index.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 (word 1) or 24 (word 2).  */
	add	len, len, tmp1, lsr 3
	ret

	/* NOTE(review): this nop looks like padding related to the 32-byte
	   alignment of L(loop) below - confirm before removing it.  */
	nop
L(loop_entry):
	/* Round src down to a 32-byte boundary.  The loop pre-increments by
	   32 before each load, so the first block read starts above srcin
	   and no later than srcin + 32; bytes it shares with the 32 bytes
	   already checked are known to be non-NUL.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!
	/* Two pairwise-minimum reductions fold the 32 data bytes into 8:
	   result bytes 0-3 come from datav1, bytes 4-7 from datav2.  A
	   reduced byte is zero iff one of its source bytes is zero.  */
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin		/* Offset of this block from the start.  */
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16		/* NUL is in the second 16 bytes.  */
1:
	/* Generate a bitmask and compute correct byte offset.  The narrowing
	   shift keeps 4 bits of each 0x00/0xff compare result, giving a
	   64-bit mask with one nibble per source byte.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	/* Little-endian builds reverse the bits so that clz counts from the
	   nibble of the lowest-addressed byte; big-endian builds skip this.  */
	rbit	synd, synd
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* 4 mask bits per byte.  */
	ret

L(page_cross):
	/* Read the 32-byte aligned block that contains srcin; an aligned
	   block cannot straddle the page boundary.  */
	bic	src, srcin, 31
	/* tmpw = 0xc0300c03: per-byte masks 0x03, 0x0c, 0x30, 0xc0 repeated
	   across every 32-bit lane, so each byte of a group of four keeps a
	   distinct 2-bit field of its compare result.  */
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	/* Two pairwise adds merge the disjoint 2-bit fields, producing a
	   64-bit syndrome with 2 bits per source byte.  */
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Discard the bytes that precede the string: shift right by two bits
	   per byte.  The register shift amount is taken modulo 64, i.e.
	   2 * (srcin % 32).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	/* No NUL in the remainder of this block: continue with the main loop,
	   which moves on to the next aligned block (src is already aligned).  */
	cbz	synd, L(loop)

	/* Index of the lowest set bit, halved (2 bits per byte), is the
	   length.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1
	ret

END (__strlen_aarch64)
