Path: contrib/arm-optimized-routines/string/aarch64/strcpy.S
/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define dstin           x0
#define srcin           x1
#define result          x0

#define src             x2
#define dst             x3
#define len             x4
#define synd            x4
#define tmp             x5
#define shift           x5
#define data1           x6
#define dataw1          w6
#define data2           x7
#define dataw2          w7

#define dataq           q0
#define vdata           v0
#define vhas_nul        v1
#define vend            v2
#define dend            d2
#define dataq2          q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a shift right
   and narrow by 4 instruction. Since the bits in the nibble mask reflect the
   order in which things occur in the original string, counting leading zeros
   identifies exactly which byte matched.  */

ENTRY (STRCPY)
        bic     src, srcin, 15          /* Align src down to 16 bytes.  */
        ld1     {vdata.16b}, [src]
        cmeq    vhas_nul.16b, vdata.16b, 0 /* 0xff wherever a byte is NUL.  */
        lsl     shift, srcin, 2         /* Four syndrome bits per byte.  */
        shrn    vend.8b, vhas_nul.8h, 4 /* 128-bit mask -> 64-bit syndrome.  */
        fmov    synd, dend
        lsr     synd, synd, shift       /* Drop bits for bytes before srcin.  */
        cbnz    synd, L(tail)

        ldr     dataq, [src, 16]!       /* Check the next 16 bytes.  */
        cmeq    vhas_nul.16b, vdata.16b, 0
        shrn    vend.8b, vhas_nul.8h, 4
        fmov    synd, dend
        cbz     synd, L(start_loop)

#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        sub     tmp, src, srcin
        clz     len, synd
        add     len, tmp, len, lsr 2    /* len = index of the NUL byte.  */
        tbz     len, 4, L(less16)
        /* 16 <= len <= 31: copy two overlapping 16-byte blocks, the
           second ending exactly at the NUL.  */
        sub     tmp, len, 15
        ldr     dataq, [srcin]
        ldr     dataq2, [srcin, tmp]
        str     dataq, [dstin]
        str     dataq2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(tail):
        rbit    synd, synd
        clz     len, synd
        lsr     len, len, 2             /* len = index of the NUL byte.  */
L(less16):
        tbz     len, 3, L(less8)
        /* 8 <= len <= 15: two overlapping 8-byte copies.  */
        sub     tmp, len, 7
        ldr     data1, [srcin]
        ldr     data2, [srcin, tmp]
        str     data1, [dstin]
        str     data2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(less8):
        subs    tmp, len, 3
        b.lo    L(less4)
        /* 3 <= len <= 7: two overlapping 4-byte copies.  */
        ldr     dataw1, [srcin]
        ldr     dataw2, [srcin, tmp]
        str     dataw1, [dstin]
        str     dataw2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(less4):
        cbz     len, L(zerobyte)
        ldrh    dataw1, [srcin]         /* 1 <= len <= 2.  */
        strh    dataw1, [dstin]
L(zerobyte):
        strb    wzr, [dstin, len]       /* Write the NUL terminator.  */
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(start_loop):
        sub     tmp, srcin, dstin       /* Constant src - dst offset.  */
        ldr     dataq2, [srcin]
        sub     dst, src, tmp           /* dst mirrors the aligned src.  */
        str     dataq2, [dstin]         /* Copy the first 16 bytes.  */
L(loop):
        /* Unrolled x2: each half checks 16 bytes for a NUL before
           storing them.  */
        str     dataq, [dst], 32
        ldr     dataq, [src, 16]
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbnz    synd, L(loopend)
        str     dataq, [dst, -16]
        ldr     dataq, [src, 32]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(loop)
        add     dst, dst, 16
L(loopend):
        shrn    vend.8b, vhas_nul.8h, 4 /* 128->64 */
        fmov    synd, dend
        sub     dst, dst, 31
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        clz     len, synd
        lsr     len, len, 2
        add     dst, dst, len           /* dst + 15 is the NUL position.  */
        ldr     dataq, [dst, tmp]
        str     dataq, [dst]            /* Final 16 bytes end at the NUL.  */
        IFSTPCPY (add result, dst, 15)
        ret

END (STRCPY)
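
The nibble-mask scan described in the header comment translates directly into
C with ACLE NEON intrinsics. Below is a minimal sketch, not part of this file:
first_nul_index is a hypothetical helper name, little-endian byte order is
assumed (the big-endian build simply skips the rbit), and GCC/Clang's
__builtin_ctzll stands in for the rbit + clz pair the assembly uses, since
AArch64 has no count-trailing-zeros instruction.

    /* Sketch only: find the first NUL in a 16-byte chunk via the
       shrn-based nibble mask.  Assumes little-endian.  */
    #include <arm_neon.h>
    #include <stdint.h>

    static inline unsigned first_nul_index (const uint8_t *p)
    {
        uint8x16_t chunk = vld1q_u8 (p);
        uint8x16_t cmp = vceqzq_u8 (chunk);     /* 0xff where byte == 0.  */
        /* Shift each 16-bit lane right by 4 and narrow to 8 bits: each
           byte of cmp contributes one nibble of a 64-bit syndrome.  */
        uint8x8_t mask = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
        uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (mask), 0);
        if (synd == 0)
            return 16;                          /* No NUL in this chunk.  */
        /* Byte i of the chunk owns syndrome bits [4i+3:4i], so the lowest
           set bit divided by 4 is the byte index of the first NUL.  */
        return (unsigned) (__builtin_ctzll (synd) >> 2);
    }

Because the syndrome preserves byte order, the same value answers both "is
there a NUL?" (any bit set) and "where is it?" (position of the lowest set
bit), which is why the assembly reuses synd for the cbnz/cbz tests and for the
length computation.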
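
The short-string paths (L(less16), L(less8), L(less4)) avoid byte-by-byte
loops: once the NUL index len is known, they issue two fixed-size loads and
stores whose ranges overlap so that together they cover exactly the len + 1
bytes of string plus terminator. A minimal C sketch of the 8..15 case follows;
copy_nul_at_8_15 is a hypothetical name, and memcpy stands in for the
unaligned ldr/str pairs.

    /* Sketch only: copy a string whose NUL sits at index len,
       8 <= len <= 15, using two overlapping 8-byte copies.  */
    #include <stdint.h>
    #include <string.h>

    static inline void copy_nul_at_8_15 (char *dst, const char *src,
                                         size_t len)
    {
        uint64_t head, tail;
        memcpy (&head, src, 8);             /* Bytes [0, 8).  */
        memcpy (&tail, src + len - 7, 8);   /* Bytes [len-7, len]: the last
                                               byte is the NUL itself.  */
        memcpy (dst, &head, 8);
        memcpy (dst + len - 7, &tail, 8);   /* Overlaps head when len < 15.  */
    }

The same pattern appears at three block sizes (16, 8, and 4 bytes), so every
string that fits in 32 bytes including its terminator is copied with a couple
of loads and stores and no copy loop.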
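
The main loop finishes with the same overlap trick: at L(loopend) the
destination pointer is adjusted so that dst + 15 is the address where the NUL
lands, and one final, possibly overlapping 16-byte copy through [dst, tmp]
completes the string. A sketch of just that closing step, with hypothetical
names (src_off mirrors tmp = srcin - dstin):

    /* Sketch only: the loop's closing step.  dst has already been set so
       that dst + 15 is the NUL's destination; src_off is the constant
       source-minus-destination offset.  */
    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    static inline char *finish_copy (char *dst, ptrdiff_t src_off)
    {
        uint8x16_t last = vld1q_u8 ((const uint8_t *) (dst + src_off));
        vst1q_u8 ((uint8_t *) dst, last);   /* Ends exactly at the NUL.  */
        return dst + 15;                    /* stpcpy's return value.  */
    }

Copying a block that ends at the terminator, rather than one that starts at
the current loop position, is what removes any tail loop: the redundant bytes
at the front of the final block were already stored by the loop body.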