Path: blob/main/contrib/arm-optimized-routines/string/aarch64/memcpy.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

END (__memcpy_aarch64)
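
The small and medium paths above avoid per-byte loops by copying a fixed-width block from the start of the buffer and another from the end, letting the two stores overlap in the middle so that one straight-line sequence covers a whole size range. A minimal C sketch of that idea follows; copy_from_both_ends is an illustrative name, not a symbol defined in this file.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy n bytes, 8 <= n <= 16, with one 8-byte block from each end, the way
   L(copy16) does with ldr/str.  When n < 16 the two blocks overlap, which is
   harmless: the overlapping bytes are simply written twice with the same
   value.  The 16..32 and 33..128 byte paths scale the same trick up to
   16-byte ldp/stp pairs and groups of four pairs.  */
static void copy_from_both_ends(unsigned char *dst, const unsigned char *src,
                                size_t n)
{
    uint64_t head, tail;
    memcpy(&head, src, 8);              /* ldr  A_l, [src]         */
    memcpy(&tail, src + n - 8, 8);      /* ldr  A_h, [srcend, -8]  */
    memcpy(dst, &head, 8);              /* str  A_l, [dstin]       */
    memcpy(dst + n - 8, &tail, 8);      /* str  A_h, [dstend, -8]  */
}

Because every load is issued before any store, these short paths are also safe when the source and destination buffers overlap, which is what allows memcpy and memmove to share a single entry point without an overlap check for small and medium sizes.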
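
Two details of the large-copy path may also be easier to see in C. The overlap test at L(copy_long) reduces to a single unsigned compare: a forward copy is only unsafe when the destination starts inside the source buffer, and the unsigned subtraction folds both bounds into one check (the dst == src case is peeled off earlier with cbz). A hedged sketch, with an illustrative function name:

#include <stddef.h>
#include <stdint.h>

/* Backward copy is needed when dst lies inside [src, src + count).  For dst
   below src the unsigned subtraction wraps to a very large value and the
   compare against count fails, so the forward loop is used; this mirrors the
   sub/cmp/b.lo sequence at L(copy_long).  */
static int must_copy_backwards(const void *dst, const void *src, size_t count)
{
    return (uintptr_t)dst - (uintptr_t)src < count;
}

The loop tail follows the same overlap idea as the small copies: L(loop64) exits once fewer than 64 bytes remain, and L(copy64_from_end) then stores a final 64 bytes loaded relative to srcend, so the last partial block simply overlaps the last full iteration instead of requiring a separate remainder loop.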