/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* L(name) expands to the assembler-local label .Lname. */
#define L(label) .L ## label

/* Arguments and results.  The length is returned in the same register
   that carries the incoming pointer, so srcin is dead once len has
   been written. */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  Note that has_nul1/has_nul2 share registers
   with tmp1/tmp2: computing a syndrome overwrites those temporaries. */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word. A faster check
   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
   false hits for characters 129..255. */

/* One byte value replicated across all eight bytes of a 64-bit word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif

/* Since strings are short on average, we check the first 16 bytes
   of the string for a NUL character. In order to do an unaligned ldp
   safely we have to do a page cross check first. If there is a NUL
   byte we calculate the length from the 2 8-byte words using
   conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 16 bytes, we align src so we don't need
   further page cross checks, and process 32 bytes per iteration
   using the fast NUL check. If we encounter non-ASCII characters,
   fall back to a second loop using the full NUL check.

   If the page cross check fails, we read 16 bytes from an aligned
   address, remove any characters before the string, and continue
   in the main loop using aligned loads. Since strings crossing a
   page in the first 16 bytes are rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4% with the 4096-byte setting), this case
   does not need to be optimized. When MIN_PAGE_SIZE is 16
   (CONFIG_KASAN_HW_TAGS) the check below sends every start address
   that is not 16-byte aligned down this path instead.

   AArch64 systems have a minimum page size of 4k. We don't bother
   checking for larger page sizes - the cost of setting up the correct
   page size is just not worth the extra gain from a small reduction in
   the cases taking the slow path. Note that we only care about
   whether the first fetch, which may be misaligned, crosses a page
   boundary. */

/*
 * __pi_strlen - length of a NUL-terminated string.
 *
 * In:   x0 (srcin) = address of the first byte of the string.
 * Out:  x0 (len)   = number of bytes before the first NUL byte.
 * Uses: x1-x8 as scratch and the condition flags; the stack is not
 *       touched.
 *
 * Flag convention used on every exit path: after the
 * "bics has_nul1 / bic has_nul2 / ccmp has_nul2, 0, 0, eq" sequence,
 * Z is set only when neither word contains a NUL, and C is clear
 * exactly when has_nul1 is non-zero (the NUL is in the first word).
 */
SYM_FUNC_START(__pi_strlen)
	/* tmp1 = offset of srcin within its MIN_PAGE_SIZE-sized block. */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	/* A 16-byte load from an offset above MIN_PAGE_SIZE - 16 would
	   run past the end of the block. */
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct. */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Accurate check: has_nulN = (dataN - 0x01..) & ~(dataN | 0x7f..). */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0, compare has_nul2 with 0; otherwise force the
	   flags to NZCV = 0 (Z clear, C clear). */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0. */
	/* Pick the syndrome of the word that holds the NUL. */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	/* After the byte swap, clz gives the bit position of the marker
	   for the lowest-addressed NUL; lsr 3 turns it into a byte index. */
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	/* Base is 0 when the NUL is in the first word, 8 otherwise. */
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check. If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check. */
	.p2align 4
L(main_loop_entry):
	/* Round srcin down to 16 bytes and bias by -16 so that the
	   pre-indexed load below first reads from (srcin & ~15) + 16. */
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	/* Pre-index with writeback: src advances by 32 on each pass. */
	ldp	data1, data2, [src, 32]!
L(page_cross_entry):
	/* Fast check on the first 16 bytes: any 0x80 bit set in
	   (data1 - 0x01..) | (data2 - 0x01..) means "possible NUL". */
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	/* Same fast check on the second 16 bytes; src is not updated. */
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	/* Hit in the second half: make src address the 16 bytes that are
	   currently held in data1/data2. */
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1 and tmp3 still hold dataN - 0x01.. from the fast check. */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	/* No real NUL: the hit was a false positive, so switch to the
	   loop that only uses the accurate check. */
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0. */
	/* src addresses the 16 bytes held in data1/data2. */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly. The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time. None of the
	   instructions below set the flags, so C stays valid for the
	   csel that follows the #endif. */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	/* len = offset of data1 from the start of the string.  This is
	   negative when arriving from the page-cross path with
	   src = srcin & ~15 below srcin. */
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	/* tmp2 = offset of data2 from the start of the string. */
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	/* Choose the offset of the word that contains the NUL, then add
	   the byte index of the NUL within that word. */
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

	/* Accurate-check loop, unrolled twice; each half advances src by
	   16 and tests 16 bytes.  There is no branch from here back to
	   the fast loop. */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and OR 0x7f into every byte
	   that precedes srcin, so that those bytes are non-zero and any
	   NUL before the string is ignored.  Then continue in the
	   aligned loop. */
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	/* tmp1 = srcin * 8; the register shifts below use it modulo 64,
	   i.e. 8 * (srcin & 7) bits. */
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian. Early bytes are at MSB. */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63). */
#else
	/* Little-endian. Early bytes are at LSB. */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63). */
#endif
	/* tmp1 is now all-ones in the bytes to keep; adding 0x80 to every
	   byte makes ~tmp1 equal to 0x7f in the bytes to hide and 0
	   elsewhere. */
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	/* Bit 3 of srcin clear: the string starts in the first word, so
	   keep the masked data1 and the untouched data2.  Bit 3 set: the
	   string starts in the second word, so replace data1 with
	   all-ones and use the masked copy of data2. */
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)
SYM_FUNC_END(__pi_strlen)
SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
EXPORT_SYMBOL_NOKASAN(strlen)