Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.
   SVE vectors are used to speedup small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)
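
Illustrative sketch (not part of the file above): a minimal C rendering, using SVE ACLE intrinsics, of two ideas the header comment describes: the predicated two-vector small copy, and the unsigned comparison that decides between the forward and backward large-copy paths. The names copy_le_2_vectors and must_copy_backwards are hypothetical, and the sketch assumes a toolchain that provides <arm_sve.h> with SVE enabled (e.g. -march=armv8-a+sve).

#include <arm_sve.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Small copies (count <= 2 SVE vectors): two predicated load/store pairs,
   mirroring the whilelo/ld1b/st1b sequence above.  Inactive lanes never
   access memory, so no per-byte tail loop is needed.  */
static void
copy_le_2_vectors (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint64_t vlen = svcntb ();			/* Bytes per SVE vector.  */
  svbool_t p0 = svwhilelt_b8_u64 (0, count);	/* Lanes of vector 0.  */
  svbool_t p1 = svwhilelt_b8_u64 (vlen, count);	/* Remaining lanes, vector 1.  */
  svuint8_t a = svld1_u8 (p0, src);
  svuint8_t b = svld1_u8 (p1, src + vlen);
  svst1_u8 (p0, dst, a);
  svst1_u8 (p1, dst + vlen, b);
}

/* Overlap test used before the large forward copy: the unsigned difference
   dst - src is below count exactly when dst points into the source buffer,
   in which case a forward copy would overwrite bytes not yet read.  */
static int
must_copy_backwards (const void *dst, const void *src, size_t count)
{
  return (uintptr_t) dst - (uintptr_t) src < count;
}

int
main (void)
{
  uint8_t src[32] = "predicated SVE copy", dst[32] = { 0 };
  copy_le_2_vectors (dst, src, sizeof src);
  printf ("%s, overlap=%d\n", dst, must_copy_backwards (dst, src, sizeof src));
  return 0;
}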