/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * This is a template rather than a complete function: it has no entry
 * label and no return instruction, and execution falls out of the bottom
 * at .Lexitfunc.
 *
 * The memory-access macros used below (ldrb1/strb1, ldrh1/strh1,
 * ldr1/str1, ldp1/stp1 and cpy1) are not defined in this file.
 * NOTE(review): they are presumably supplied by whichever file includes
 * this template. The comments below assume that each macro accesses memory
 * at its pointer operand and then advances that pointer by its immediate
 * operand -- confirm against the includer.
 *
 * Registers written directly by instructions in this template: count (x2),
 * tmp1 (x3), tmp2 (x4), dst (x6) and the condition flags. src, dst and
 * A_l..D_h are additionally handed to the access macros. dstin (x0) is
 * only ever read here.
 */
dstin	.req	x0	/* destination as passed in; only read below */
src	.req	x1	/* source pointer */
count	.req	x2	/* byte count; decremented as the copy proceeds */
tmp1	.req	x3	/* scratch (64-bit view) */
tmp1w	.req	w3	/* scratch (32-bit view of tmp1) */
tmp2	.req	x4	/* scratch: number of bytes needed to align src */
tmp2w	.req	w4	/* 32-bit view of tmp2; not referenced in this template */
dst	.req	x6	/* working destination pointer */

/* Four register pairs; together they hold one 64-byte block of data. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	/* Work on a copy of the destination pointer; dstin is left as is. */
	mov	dst, dstin

#ifdef CONFIG_AS_HAS_MOPS
	/*
	 * NOTE(review): going by the macro names, the branch to .Lno_mops is
	 * taken when the ARM64_HAS_MOPS capability is absent, and is replaced
	 * by a NOP otherwise so that cpy1 performs the whole copy. The
	 * alternative_* macros and cpy1 are defined elsewhere -- confirm.
	 */
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif

	cmp	count, #16
	/* When the length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	/* tmp2 = (-src) & 15: distance from src up to a 16-byte boundary. */
	neg	tmp2, src
	ands	tmp2, tmp2, #15 /* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading memory data from src to dst in increasing
	 * address order. This way, the risk of overwriting the source
	 * memory data is eliminated when the distance between src and
	 * dst is less than 16. Each bit of tmp2 selects one copy of the
	 * matching size (1, 2, 4, then 8 bytes), so each load is naturally
	 * aligned with respect to src (given the macro behaviour assumed in
	 * the header comment).
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	/* Note: b.ge is a signed comparison of count against 64. */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 *
	 * tmp1 = count & 0x30 is 0, 16, 32 or 48. Zero skips to .Ltiny15;
	 * 0x20 enters at 1: (two 16-byte pairs); 0x10 enters at 2: (one
	 * pair); 0x30 falls through all three pairs.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that access
	 * memory in increasing address order, rather than to load/store 16
	 * bytes from (src-16) to (dst-16) and move src back to an aligned
	 * address, which is what the original cortex memcpy does. If the
	 * original memcpy process were kept here, memmove would need to
	 * satisfy the precondition that the src address is at least 16 bytes
	 * bigger than the dst address, otherwise some source data would be
	 * overwritten when memmove calls memcpy directly. To make memmove
	 * simpler and to decouple memcpy from memmove, the original process
	 * was withdrawn.
	 *
	 * Bits 3..0 of count select an 8-, 4-, 2- and 1-byte copy, in that
	 * order.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	 * Bias count by -128. A non-negative result means at least 128 bytes
	 * remain, which the large loop handles. Subtracting a multiple of 64
	 * leaves the bottom 6 bits of count unchanged, which is all that
	 * .Ltail63 looks at.
	 */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	/* Any of the bottom 6 bits set means 1..63 bytes are still left. */
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64-byte block. count was biased by -128 at
	 * .Lcpy_over64, so the loop repeats while the biased count stays
	 * non-negative after each subtraction of 64.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	/* Store the final 64 bytes that were loaded on the last pass. */
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	/* Finish any remaining 1..63 bytes through the common tail. */
	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:

