/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
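/*
 * Reference sketch (illustrative only, not part of the build): the
 * general case above behaves roughly like the following C.  The name
 * memcpy_sketch is hypothetical, and the unaligned-source path is
 * shown here as a plain byte loop, whereas the assembly below keeps
 * copying a word at a time via SRC (.Lsrcunaligned).
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (n && ((unsigned long)d & 3)) {	// .Ldst1mod2/.Ldst2mod4
 *			*d++ = *s++;
 *			n--;
 *		}
 *		if (!((unsigned long)s & 3)) {		// word copy, .Loop1
 *			unsigned long *dw = (unsigned long *)d;
 *			const unsigned long *sw = (const unsigned long *)s;
 *
 *			while (n >= 16) {		// 16 bytes per iteration
 *				*dw++ = *sw++;
 *				*dw++ = *sw++;
 *				*dw++ = *sw++;
 *				*dw++ = *sw++;
 *				n -= 16;
 *			}
 *			d = (unsigned char *)dw;
 *			s = (const unsigned char *)sw;
 *		}
 *		while (n--)				// 8/4/2/1-byte tail
 *			*d++ = *s++;
 *		return dst;
 *	}
 */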
	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte	# continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)
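/*
 * Note on the shifting copy in .Lsrcunaligned above (a sketch, not
 * part of the build): __ssa8 latches a shift amount derived from the
 * low two bits of the source address, and each __src_b then
 * funnel-shifts two adjacent aligned source words into one aligned
 * destination word.  Assuming a little-endian core, the combining
 * step is roughly this C, where w0 and w1 are the two consecutive
 * aligned words overlapping the unaligned source word (shift is
 * never 0 on that path, since it is only taken for unaligned src):
 *
 *	unsigned long shift = 8 * ((unsigned long)src & 3);
 *	unsigned long out = (w0 >> shift) | (w1 << (32 - shift));
 */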
/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the end of the source doesn't overlap the destination,
 *     then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
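/*
 * The dispatch in .Lmovecommon below implements this decision with a
 * single unsigned comparison.  A rough C sketch (illustrative only,
 * not part of the build; memmove_sketch is a hypothetical name, and
 * the assembly branches into memcpy's .Lcommon path rather than
 * making a call):
 *
 *	void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((unsigned long)d - (unsigned long)s >= len)
 *			return memcpy(d, s, len);	// forward copy is safe
 *
 *		d += len;				// copy backwards
 *		s += len;
 *		while (len--)
 *			*--d = *--s;
 *		return dst;
 *	}
 *
 * The unsigned subtraction wraps around when dst < src, so the single
 * bgeu test covers both a destination below the source and one that
 * starts at least len bytes past it.
 */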
/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte	# continue loop if
					# $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1	# continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2	# continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)