/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007 Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non-dma-coherent systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_CPU_MIPSR6
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
/* Prefetch type */
#define SRC_PREFETCH 1
#define DST_PREFETCH 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 */

#define EXC(insn, type, reg, addr, handler)			\
	.if \mode == LEGACY_MODE;				\
9:		insn reg, addr;					\
		.section __ex_table,"a";			\
		PTR_WD	9b, handler;				\
		.previous;					\
	/* This is assembled in EVA mode */			\
	.else;							\
		/* If loading from user or storing to user */	\
		.if ((\from == USEROP) && (type == LD_INSN)) || \
		    ((\to == USEROP) && (type == ST_INSN));	\
9:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
			.section __ex_table,"a";		\
			PTR_WD	9b, handler;			\
			.previous;				\
		.else;						\
			/*					\
			 * Still in EVA, but no need for	\
			 * exception handler or EVA insn	\
			 */					\
			insn reg, addr;				\
		.endif;						\
	.endif
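
/*
 * Illustration only ("handler" here stands for whichever .Ll_exc*-style
 * label the caller passes): in LEGACY_MODE an invocation such as
 *
 *	EXC(lw, LD_INSN, t0, 0(src), handler)
 *
 * is expected to expand to roughly
 *
 * 9:	lw	t0, 0(src)
 *	.section __ex_table, "a"
 *	PTR_WD	9b, handler
 *	.previous
 *
 * i.e. the potentially faulting access plus an __ex_table entry pairing
 * its address with the fixup handler, which is how __copy_user survives
 * faults that plain memcpy must never see.
 */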

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOADK  ld /* No exception */
#define LOAD(reg, addr, handler)	EXC(ld, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(ldl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(ldr, LD_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(sdl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(sdr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sd, ST_INSN, reg, addr, handler)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOADK  lw /* No exception */
#define LOAD(reg, addr, handler)	EXC(lw, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(lwl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(lwr, LD_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(swl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(swr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sw, ST_INSN, reg, addr, handler)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#define LOADB(reg, addr, handler)	EXC(lb, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)

#ifdef CONFIG_CPU_HAS_PREFETCH
# define _PREF(hint, addr, type)					\
	.if \mode == LEGACY_MODE;					\
		kernel_pref(hint, addr);				\
	.else;								\
		.if ((\from == USEROP) && (type == SRC_PREFETCH)) ||	\
		    ((\to == USEROP) && (type == DST_PREFETCH));	\
			/*						\
			 * PREFE has only 9 bits for the offset		\
			 * compared to PREF which has 16, so it may	\
			 * need to use the $at register but this	\
			 * register should remain intact because it's	\
			 * used later on. Therefore use $v1.		\
			 */						\
			.set	at=v1;					\
			user_pref(hint, addr);				\
			.set	noat;					\
		.else;							\
			kernel_pref(hint, addr);			\
		.endif;							\
	.endif
#else
# define _PREF(hint, addr, type)
#endif

#define PREFS(hint, addr) _PREF(hint, addr, SRC_PREFETCH)
#define PREFD(hint, addr) _PREF(hint, addr, DST_PREFETCH)

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
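
/*
 * Illustration only (assumes a 32-bit little-endian configuration, so
 * NBYTES == 4, LDFIRST == lwr and LDREST == lwl): for unit 0 the pair
 *
 *	LDFIRST(t0, FIRST(0)(src), handler)	# lwr t0, 0(src)
 *	LDREST(t0, REST(0)(src), handler)	# lwl t0, 3(src)
 *
 * reads NBYTES bytes starting at a possibly misaligned src into t0, each
 * instruction supplying the part of the word that lies on its side of the
 * alignment boundary.
 */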

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

	.align	5

/*
 * Macro to build the __copy_user common code
 * Arguments:
 * mode : LEGACY_MODE or EVA_MODE
 * from : Source operand. USEROP or KERNELOP
 * to   : Destination operand. USEROP or KERNELOP
 */
	.macro __BUILD_COPY_USER mode, from, to

	/* initialize __memcpy if this is the first time we execute this macro */
	.ifnotdef __memcpy
	.set __memcpy, 1
	.hidden __memcpy /* make sure it does not leak */
	.endif

	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	R10KCBARRIER(0(ra))
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREFS(	0, 0(src) )
	PREFD(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREFS(	0, 1*32(src) )
	PREFD(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen\@
	 and	t0, src, ADDRMASK
	PREFS(	0, 2*32(src) )
	PREFD(	1, 2*32(dst) )
#ifndef CONFIG_CPU_NO_LOAD_STORE_LR
	bnez	t1, .Ldst_unaligned\@
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned\@
#else /* CONFIG_CPU_NO_LOAD_STORE_LR */
	or	t0, t0, t1
	bnez	t0, .Lcopy_unaligned_bytes\@
#endif /* CONFIG_CPU_NO_LOAD_STORE_LR */
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned\@:
	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned\@	# len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREFS(	0, 3*32(src) )
	PREFD(	1, 3*32(dst) )
	.align	4
1:
	R10KCBARRIER(0(ra))
	LOAD(t0, UNIT(0)(src), .Ll_exc\@)
	LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
	LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
	LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
	SUB	len, len, 8*NBYTES
	LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
	LOAD(t7, UNIT(5)(src), .Ll_exc_copy\@)
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p8u\@)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p7u\@)
	LOAD(t0, UNIT(6)(src), .Ll_exc_copy\@)
	LOAD(t1, UNIT(7)(src), .Ll_exc_copy\@)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE(t2, UNIT(-6)(dst), .Ls_exc_p6u\@)
	STORE(t3, UNIT(-5)(dst), .Ls_exc_p5u\@)
	STORE(t4, UNIT(-4)(dst), .Ls_exc_p4u\@)
	STORE(t7, UNIT(-3)(dst), .Ls_exc_p3u\@)
	STORE(t0, UNIT(-2)(dst), .Ls_exc_p2u\@)
	STORE(t1, UNIT(-1)(dst), .Ls_exc_p1u\@)
	PREFS(	0, 8*32(src) )
	PREFD(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned\@:
	beqz	len, .Ldone\@
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units\@
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
	LOAD( t0, UNIT(0)(src), .Ll_exc\@)
	LOAD( t1, UNIT(1)(src), .Ll_exc_copy\@)
	LOAD( t2, UNIT(2)(src), .Ll_exc_copy\@)
	LOAD( t3, UNIT(3)(src), .Ll_exc_copy\@)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	R10KCBARRIER(0(ra))
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p4u\@)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p3u\@)
	STORE(t2, UNIT(2)(dst),	.Ls_exc_p2u\@)
	STORE(t3, UNIT(3)(dst),	.Ls_exc_p1u\@)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone\@
	.set	noreorder
.Lless_than_4units\@:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes\@
	 nop
1:
	R10KCBARRIER(0(ra))
	LOAD(t0, 0(src), .Ll_exc\@)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst), .Ls_exc_p1u\@)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

#ifndef CONFIG_CPU_NO_LOAD_STORE_LR
	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone\@
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
	LOAD(t0, 0(src), .Ll_exc\@)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1), .Ls_exc\@)
	jr	ra
	 move	len, zero
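
	/*
	 * Worked example of the tail store above, for illustration only
	 * (assumes little-endian with NBYTES == 4 and len == 3): rem ends
	 * up as 24 (bits to keep), bits as 32 - 24 = 8 (bits to discard),
	 * SHIFT_DISCARD (sllv here) moves the three wanted source bytes to
	 * the top of t0, and STREST (swl) at dst + len - 1 then writes
	 * exactly those three bytes to dst..dst+2 without ever reading dst.
	 */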
.Ldst_unaligned\@:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	R10KCBARRIER(0(ra))
	STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
	beq	len, t2, .Ldone\@
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned\@
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned\@:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREFS(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned\@
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREFD(	1, 3*32(dst) )
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
	R10KCBARRIER(0(ra))
	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
	LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
	LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
	LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
	LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
	LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
	LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
	PREFS(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p4u\@)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p3u\@)
	STORE(t2, UNIT(2)(dst),	.Ls_exc_p2u\@)
	STORE(t3, UNIT(3)(dst),	.Ls_exc_p1u\@)
	PREFD(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned\@:
	beqz	len, .Ldone\@
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes\@
	 nop
1:
	R10KCBARRIER(0(ra))
	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst), .Ls_exc_p1u\@)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

#endif /* !CONFIG_CPU_NO_LOAD_STORE_LR */
.Lcopy_bytes_checklen\@:
	beqz	len, .Ldone\@
	 nop
.Lcopy_bytes\@:
	/* 0 < len < NBYTES */
	R10KCBARRIER(0(ra))
#define COPY_BYTE(N)			\
	LOADB(t0, N(src), .Ll_exc\@);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone\@;		\
	STOREB(t0, N(dst), .Ls_exc_p1\@)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	LOADB(t0, NBYTES-2(src), .Ll_exc\@)
	SUB	len, len, 1
	jr	ra
	 STOREB(t0, NBYTES-2(dst), .Ls_exc_p1\@)
.Ldone\@:
	jr	ra
	 nop

#ifdef CONFIG_CPU_NO_LOAD_STORE_LR
.Lcopy_unaligned_bytes\@:
1:
	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
	COPY_BYTE(6)
	COPY_BYTE(7)
	ADD	src, src, 8
	b	1b
	 ADD	dst, dst, 8
#endif /* CONFIG_CPU_NO_LOAD_STORE_LR */
	.if __memcpy == 1
	END(memcpy)
	.set __memcpy, 0
	.hidden __memcpy
	.endif

.Ll_exc_copy\@:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)
1:
	LOADB(t1, 0(src), .Ll_exc\@)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc\@:
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	jr	ra
	 nop
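
	/*
	 * Illustration only: AT was set up by the caller to hold the address
	 * just past the end of the source buffer (see the spec comment at
	 * the top of this file), and THREAD_BUADDR gives the first bad
	 * address, so if e.g. a 128-byte copy faults with 40 source bytes
	 * still to go, the subtraction above leaves len == 40, the upper
	 * bound on the number of bytes that were not copied.
	 */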

#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
.Ls_exc_p ## n ## u\@:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1\@:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc\@:
	jr	ra
	 nop
	.endm

#ifndef CONFIG_HAVE_PLAT_MEMCPY
	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)
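
/*
 * Illustration of memmove's overlap test above (for reference only): with
 * dst == 0x1010, src == 0x1000 and len == 0x20 both sltu results are 1
 * (src < dst + len and dst < src + len), so the regions overlap, the
 * branch to .L__memcpy is not taken, and execution falls through to
 * __rmemcpy, which sees src < dst and copies backwards from the end,
 * the safe direction for this kind of overlap.
 */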

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
.L__memcpy:
#ifndef CONFIG_EVA
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
#endif
	/* Legacy Mode, user <-> user */
	__BUILD_COPY_USER LEGACY_MODE USEROP USEROP

#endif

#ifdef CONFIG_EVA

/*
 * For EVA we need distinct symbols for reading and writing to user space.
 * This is because we need to use specific EVA instructions to perform the
 * virtual <-> physical translation when a virtual address is actually in
 * user space.
 */

/*
 * __copy_from_user (EVA)
 */

LEAF(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
	__BUILD_COPY_USER EVA_MODE USEROP KERNELOP
	END(__raw_copy_from_user)



/*
 * __copy_to_user (EVA)
 */

LEAF(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
	__BUILD_COPY_USER EVA_MODE KERNELOP USEROP
	END(__raw_copy_to_user)

#endif
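
/*
 * Note, for illustration only: with the arguments above the EXC() wrapper
 * emits the EVA ("e") form of an instruction only when it touches user
 * memory, so __raw_copy_from_user gets EVA loads and ordinary kernel
 * stores, while __raw_copy_to_user gets ordinary loads and EVA stores.
 */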