/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3
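
/*
 * For reference, an illustrative expansion (not spelled out in the
 * original source): with the 64-bit macro set above,
 *
 *	EXC(LOAD t0, UNIT(0)(src), .Ll_exc)
 *
 * emits roughly
 *
 *	9:	ld	t0, 0(src)
 *		.section __ex_table,"a"
 *		PTR	9b, .Ll_exc
 *		.previous
 *
 * i.e. the load itself plus an exception-table entry that sends a fault
 * taken at label 9 to the .Ll_exc fixup code below.
 */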

/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register names
 * from the n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
	and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	nop
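
	/*
	 * Illustrative trace of the loop above (example values, not from
	 * the original source): on a 64-bit kernel (NBYTES == 8), a call
	 * with both pointers aligned and len == 200 computes
	 * rem = 200 % 64 = 8, so the unrolled loop runs three times
	 * (len: 200 -> 136 -> 72 -> 8), moving 64 bytes per iteration;
	 * the remaining 8 bytes are handled by the cleanup code below.
	 */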

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	nop
1:
EXC(	LOAD	t0, 0(src),	.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),	.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	move	len, zero
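
	/*
	 * Illustrative example of the SHIFT_DISCARD/STREST trick above
	 * (example values, not from the original source): on a
	 * little-endian 32-bit kernel with len == 3, rem = 24 and
	 * bits = 32 - 24 = 8, so SHIFT_DISCARD (sllv) shifts the unwanted
	 * fourth source byte out of the top of t0, and STREST (swl) at
	 * dst+2 then writes exactly the three bytes dst[0..2], leaving
	 * dst[3] untouched.
	 */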
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST	t3, FIRST(0)(dst)
	beq	len, t2, .Ldone
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	nop
.Lcopy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
	sb	t0, N(dst)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
	sb	t0, NBYTES-2(dst)
.Ldone:
	jr	ra
	nop
	END(__copy_user_inatomic)

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	nop
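
/*
 * Illustrative summary of the fixup path above (example values, not from
 * the original source): if the routine is entered with src == S and
 * len == L, uaccess.h has set AT = S + L.  Should a load fault at source
 * address F, THREAD_BUADDR records F, .Ll_exc_copy copies the bytes below
 * F one at a time, and .Ll_exc returns with len = (S + L) - F, an upper
 * bound on the number of uncopied bytes.
 */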