Path: blob/main/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin    x0
#define src      x1
#define count    x2
#define dst      x3
#define srcend1  x4
#define dstend1  x5
#define A_l      x6
#define A_lw     w6
#define A_h      x7
#define B_l      x8
#define B_lw     w8
#define B_h      x9
#define C_l      x10
#define C_lw     w10
#define C_h      x11
#define D_l      x12
#define D_h      x13
#define E_l      x14
#define E_h      x15
#define F_l      x16
#define F_h      x17
#define G_l      count
#define G_h      dst
#define H_l      src
#define H_h      srcend1
#define tmp1     x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple while improving performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add     srcend1, src, count
        add     dstend1, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
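
/* For reference, a rough C sketch of the dispatch implemented above.  This is
   illustrative only: the helper names are hypothetical, and the real code is
   branchless within each size class.  The single unsigned comparison
   (uintptr_t)dst - (uintptr_t)src < n is what lets one entry point serve both
   memcpy and memmove: it is true exactly when dst lies inside [src, src + n),
   the only case where a forward copy would clobber not-yet-read source bytes.

     void *sc_memcpy_sketch(void *dst, const void *src, size_t n) {
         if (n <= 32)  return copy_0_32(dst, src, n);    // small
         if (n <= 128) return copy_33_128(dst, src, n);  // medium
         if (dst == src) return dst;                     // nothing to move
         if ((uintptr_t)dst - (uintptr_t)src < n)        // dst in [src, src+n)
             return copy_long_backwards(dst, src, n);
         return copy_long_forwards(dst, src, n);
     }
*/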

//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw
#else
        bfi     valw, valw, #8, #8
        bfi     valw, valw, #16, #16
        bfi     val, val, #32, #32
        fmov    d0, val
        fmov    v0.d[1], val
#endif
        add     dstend2, dstin, count

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)
        mov     val, v0.D[0]

        /* Set 0..15 bytes.  */
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 16..96 bytes.  */
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend2, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes.  Write 64 bytes from the start and
           32 bytes from the end.  */
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
L(set_long):
        and     valw, valw, 255
        bic     dst, dstin, 15
        str     q0, [dstin]
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    L(no_zva)

#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
        b.ne    L(no_zva)
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst     /* Count is now 64 too large.  */
        sub     count, count, 128       /* Adjust count and bias for loop.  */

        .p2align 4
L(zva_loop):
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    L(zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

L(no_zva):
        sub     count, dstend2, dst     /* Count is 16 too large.  */
        sub     dst, dst, 16            /* Dst is biased by -32.  */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
L(no_zva_loop):
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    L(no_zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
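
/* A note on the DC ZVA fast path above: it is only taken when the buffer is
   at least 160 bytes, the fill value is zero (the ccmp forces "ne" for a
   nonzero valw), and DCZID_EL0 advertises a 64-byte zeroing block.  A hedged
   C equivalent of that last check is sketched below; the helper name is
   hypothetical, and the field layout follows the Arm ARM (BS in bits [3:0]
   as log2 of the block size in words, DZP in bit 4 meaning DC ZVA is
   prohibited).

     static int zva_block_is_64_bytes(void) {
         unsigned long dczid;
         __asm__("mrs %0, dczid_el0" : "=r"(dczid));
         // Masking with 31 keeps BS and DZP together, so comparing with 4
         // succeeds only if DZP == 0 and the block is 2^4 words = 64 bytes.
         return (dczid & 31) == 4;
     }
*/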

#endif // __aarch64__
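
/* Usage sketch.  These routines are the streaming-compatible counterparts of
   memcpy/memmove/memset for SME: code in streaming SVE mode cannot assume the
   plain libc routines are safe to call, so the compiler lowers memory
   operations to these instead.  Assuming declarations along these lines (the
   ACLE additionally spells them with SME attributes such as
   __arm_streaming_compatible):

     void *__arm_sc_memcpy(void *dst, const void *src, size_t n);
     void *__arm_sc_memmove(void *dst, const void *src, size_t n);
     void *__arm_sc_memset(void *dst, int c, size_t n);
*/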