/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2005 Tensilica Inc.
 */

#include <variant/core.h>

/*
 * src_b r, w0, w1 -- endian-neutral funnel shift.
 *
 * Produces in \r a 32-bit value extracted from the 64-bit concatenation
 * of the two source words, shifted by the amount previously loaded into
 * SAR (see the ssa8 macro below).  The SRC instruction's operand order
 * determines which word supplies the high bits, so big- and
 * little-endian builds need the operands swapped; this macro hides that
 * difference from the unaligned-source copy loops.
 */
.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
.endm

/*
 * ssa8 r -- load SAR (shift-amount register) from an address.
 *
 * Sets the shift amount to 8 * (\r & 3), i.e. the byte misalignment of
 * address register \r, so that subsequent src_b operations re-align
 * data streamed from an unaligned source.  ssa8b (big-endian) and
 * ssa8l (little-endian) differ only in shift direction convention.
 */
.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
.endm


/*
 * void *memcpy(void *dst, const void *src, size_t len);
 * void *memmove(void *dst, const void *src, size_t len);
 * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() (or bcopy()) for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * NOTE(review): memmove below is a plain alias for memcpy -- it always
 * copies forward and does NOT handle overlapping regions where
 * dst > src.  Confirm callers never rely on true memmove overlap
 * semantics, or add a backward-copy path.
 *
 * The bcopy version is provided here to avoid the overhead
 * of an extra call, for callers that require this convention.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 * This code tries to use fall-through branches for the common
 *   case of aligned source and destination and multiple
 *   of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

	.text
	.align	4
	.global	bcopy
	.type	bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3		# copy dst so that a2 is return value
	mov	a3, a2		# bcopy's (src, dst) order is swapped
	mov	a2, a5		# relative to memcpy; fix up, then share code
	j	.Lcommon	# go to common code for memcpy+bcopy


/*
 * Byte by byte copy -- fallback for very short or odd-length tails.
 * Copies a4 bytes from a3 to a5, one byte per iteration.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone	# zero-overhead loop; skips if a4 == 0
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	blt	a3, a7, .Lnextbyte
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned: copy 1 and/or 2 bytes to word-align dst,
 * then rejoin the main algorithm at .Ldstaligned.
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

	.align	4
	.global	memcpy
	.type	memcpy,@function
memcpy:
	.global	memmove
	.type	memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	# loads and stores are interleaved to hide load latency
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	blt	a3, a8, .Loop1
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	# finish the 0..15 byte tail, testing length bits 3, 2, 1, 0
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw

/*
 * Destination is aligned, Source is unaligned:
 * stream aligned word loads from the aligned-down source and use
 * SAR-based funnel shifts (src_b) to reassemble the bytes.
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
					   lint or ferret client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below (a8 still 3)
	sub	a3, a3, a11	# align a3 (round down to word boundary)
#endif
	l32i	a6, a3, 0	# load first word; kept live across the loop
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	# each output word is shifted out of a pair of adjacent source
	# words; a6 carries the trailing word into the next iteration
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	blt	a3, a10, .Loop2
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8		# keep look-ahead word live for next step
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7		# keep look-ahead word live for next step
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	# last 0..3 bytes are copied with byte loads at the true address
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */