Path: contrib/arm-optimized-routines/string/arm/memcpy.S
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
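	/* The dispatch below is a computed branch: reading PC yields the
	   address of the current instruction plus PC_OFFSET (8 in ARM
	   state).  Worked example of the arithmetic: with tmp1 =
	   count & 0x3c pending bytes, the rsb below leaves 58 - tmp1,
	   and "add pc, pc, tmp1, lsl #1" skips (60 - tmp1) / 4 of the 15
	   ldr/str pairs.  With 60 bytes pending, pc advances 8 - 4 bytes
	   to the first pair; with 0 pending it clears all 120 bytes of
	   pairs.  The pointers are pre-advanced, so the remaining pairs
	   copy through negative offsets.  */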
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	L(done)
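	/* tmp2 went negative on the final subs above; as the GP-register
	   tail further down notes, that does not matter: the bottom six
	   bits still hold the number of bytes left to copy.  */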
L(tail63aligned):			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
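	/* "SMS style" refers to software pipelining (software modulo
	   scheduling): each iteration stores the four doublewords loaded
	   on the previous iteration while issuing the loads for the next
	   one, overlapping loads and stores.  The "b 1f" below enters at
	   the loop's store half to prime the registers, and the code
	   after "bcs 2b" drains the final loads.  B-D live in the
	   callee-saved registers r4-r9, so they are spilled into the
	   stack frame alongside the first loads.  */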
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

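	/* Unlike the aligned loop above, the loads here are pairs of
	   word LDRs rather than LDRDs: dst has been brought to 64-bit
	   alignment but src has not, and while plain LDR supports the
	   unaligned accesses this routine assumes, LDRD faults on
	   addresses that are not word-aligned.  The stores can remain
	   STRD because they target the aligned side.  */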
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)
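/* Usage sketch: the register assignments above (dstin = r0, src = r1,
   count = r2) correspond to the standard memcpy contract; the C
   prototype is an assumption, not part of this file:

       void *__memcpy_arm (void *dst, const void *src, size_t n);

   r0 is never written after entry (the routine copies through the dst
   alias in ip), so the destination pointer is returned unchanged, as
   memcpy requires.  */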