Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy-advsimd.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	add	srcend, src, count
	cmp	count, 128
	b.hi	L(copy_long)
	add	dstend, dstin, count
	cmp	count, 32
	b.hi	L(copy32_128)
	nop

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 3
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	.p2align 4
	nop

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
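
/* The copy structure described in the header comment can be illustrated with a
   small C sketch.  This sketch is not part of the source file: the names
   sketch_memcpy, chunk16, load16 and store16 are invented for illustration, it
   covers only the forward, non-overlapping path, and it approximates the
   8/4/1-byte small-copy cases with a plain byte loop.  The assembly above
   additionally handles overlaps: the single unsigned compare of dstin - src
   against count (b.lo L(copy_long_backwards)) detects when a forward copy would
   overwrite source bytes that have not been read yet.  */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* A 16-byte chunk; memcpy of a constant 16 bytes is typically lowered to a
   single unaligned Q-register load or store on AArch64.  */
typedef struct { unsigned char b[16]; } chunk16;

static inline chunk16 load16(const unsigned char *p) {
    chunk16 c;
    memcpy(c.b, p, 16);
    return c;
}

static inline void store16(unsigned char *p, chunk16 c) {
    memcpy(p, c.b, 16);
}

void *sketch_memcpy(void *dstin, const void *srcin, size_t count)
{
    unsigned char *dst = dstin;
    const unsigned char *src = srcin;
    unsigned char *dstend = dst + count;
    const unsigned char *srcend = src + count;

    if (count <= 32) {
        /* Small copies: the first and last 16 bytes, loaded before storing.  */
        if (count >= 16) {
            chunk16 a = load16(src), b = load16(srcend - 16);
            store16(dst, a);
            store16(dstend - 16, b);
        } else {
            for (size_t i = 0; i < count; i++)  /* stands in for L(copy16)..L(copy4) */
                dst[i] = src[i];
        }
        return dstin;
    }

    if (count <= 128) {
        /* Medium copies: first 32 and last 32 bytes always; the middle is
           covered by up to two more 32-byte blocks.  */
        chunk16 a = load16(src), b = load16(src + 16);
        chunk16 c = load16(srcend - 32), d = load16(srcend - 16);
        if (count > 64) {
            store16(dst + 32, load16(src + 32));
            store16(dst + 48, load16(src + 48));
            if (count > 96) {
                store16(dstend - 64, load16(srcend - 64));
                store16(dstend - 48, load16(srcend - 48));
            }
        }
        store16(dst, a);
        store16(dst + 16, b);
        store16(dstend - 32, c);
        store16(dstend - 16, d);
        return dstin;
    }

    /* Large copies: copy the first 16 bytes, align src down to 16 bytes, loop
       in 64-byte blocks, then unconditionally copy the last 64 bytes from the
       end so no tail loop is needed.  */
    store16(dst, load16(src));
    size_t skew = 16 - ((uintptr_t)src & 15);   /* 1..16 bytes already copied */
    src += skew;                                /* src is now 16-byte aligned */
    dst += skew;
    count -= skew;

    while (count > 64) {
        store16(dst,      load16(src));
        store16(dst + 16, load16(src + 16));
        store16(dst + 32, load16(src + 32));
        store16(dst + 48, load16(src + 48));
        src += 64;
        dst += 64;
        count -= 64;
    }

    /* Tail: the last 64 bytes, possibly overlapping bytes already written.  */
    store16(dstend - 64, load16(srcend - 64));
    store16(dstend - 48, load16(srcend - 48));
    store16(dstend - 32, load16(srcend - 32));
    store16(dstend - 16, load16(srcend - 16));
    return dstin;
}

/* The re-copied bytes in the alignment and tail steps are harmless because
   source and destination do not overlap on this path; the overlapping
   (memmove) case is handled by the backwards loop in the assembly.  */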