Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-opt-memcpy-memmove.S
213799 views
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memcpy / __arm_sc_memmove
//
// In:    x0 = dstin (destination), x1 = src (source), x2 = count (bytes).
// Out:   x0 is never written by this routine, so it still holds dstin at
//        every `ret`.
// Uses:  x1-x17 and the condition flags are overwritten. The instructions
//        below use only general-purpose registers and never reference sp.
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
/* G_* and H_* reuse the registers of count, dst, src and srcend1. Loading
   into them destroys those values, so they are only used at points where
   the aliased value is no longer needed.  */
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
/* tmp1 shares x14 with E_l; every use of tmp1 below is finished before E_l
   is loaded on the same path.  */
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.

   In the small and medium paths every load is issued before the first store,
   which is why those paths need no overlap check.
*/

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
        add     srcend1, src, count     // srcend1 = one past the last source byte
        add     dstend1, dstin, count   // dstend1 = one past the last destination byte
        cmp     count, 128
        b.hi    7f  // copy_long
        cmp     count, 32
        b.hi    4f  // copy32_128

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    0f  // copy16
        /* 16..32 bytes: first 16 and last 16 bytes; the two halves may overlap.  */
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes.  */
0:  // copy16
        tbz     count, 3, 1f  // copy8  (bit 3 clear => count < 8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
1:  // copy8
        tbz     count, 2, 2f  // copy4  (bit 2 clear => count < 4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.
           For count 1..3 the bytes at offsets 0, count/2 and count-1 cover
           every byte of the buffer (some of them more than once).  */
2:  // copy4
        cbz     count, 3f  // copy0
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
3:  // copy0
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
4:  // copy32_128
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    5f  // copy128
        /* 33..64 bytes: first 32 and last 32 bytes.  */
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
5:  // copy128
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    6f  // copy96
        /* 97..128 bytes: also copy the 32 bytes at end-64.  G_* and H_*
           overwrite count, dst, src and srcend1, none of which is read
           again on this path.  The H load comes last because it replaces
           srcend1, the base register of both loads.  */
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
6:  // copy96
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
7:  // copy_long
        /* Use backwards copy if there is an overlap.
           tmp1 = dstin - src.  Zero means source and destination are the
           same address, so there is nothing to do.  An unsigned value below
           count means dstin lies inside [src, src + count).  */
        sub     tmp1, dstin, src
        cbz     tmp1, 3b  // copy0
        cmp     tmp1, count
        b.lo    10f  // copy_long_backwards

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15         // tmp1 = misalignment of dstin
        bic     dst, dstin, 15          // dst = dstin rounded down to 16
        sub     src, src, tmp1          // move src back by the same amount
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]       // unaligned store of the first 16 bytes
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!    // pre-index: src advances by 64
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    9f  // copy64_from_end
8:  // loop64
        /* Each iteration stores the 64 bytes loaded previously while
           loading the next 64; dst and src advance by 64 via writeback.  */
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    8b  // loop64

        /* Write the last iteration and copy 64 bytes from the end.  */
9:  // copy64_from_end
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
10:  // copy_long_backwards
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15       // tmp1 = misalignment of dstend1
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]  // unaligned store of the last 16 bytes
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!  // pre-index: srcend1 retreats by 64
        sub     dstend1, dstend1, tmp1  // dstend1 is now 16-byte aligned
        subs    count, count, 128
        b.ls    12f  // copy64_from_start

11:  // loop64_backwards
        /* Mirror image of loop64: walks from the end towards the start,
           with dstend1 and srcend1 retreating by 64 via writeback.  */
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    11b  // loop64_backwards

        /* Write the last iteration and copy 64 bytes from the start.
           G_* overwrites count and dst, which are not read again here;
           src is still needed as the base of the remaining loads.  */
12:  // copy64_from_start
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

/* memmove shares the entry point: the routine above already handles
   overlapping buffers.  */
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)