/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
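/*
 * For example, EXC(LOAD t0, UNIT(0)(src), .Ll_exc) emits the load under a
 * local "9:" label and records a "PTR 9b, .Ll_exc" pair in the __ex_table
 * section; if that particular access faults, the kernel's exception code
 * looks the faulting instruction up in __ex_table and resumes at the named
 * fixup handler instead of treating the fault as fatal.
 */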
/*
 * Only the 64-bit kernel can make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the registers from the
 * n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
.L__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	R10KCBARRIER(0(ra))
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	.Ls_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	.Ls_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	.Ls_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	.Ls_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	.Ls_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	R10KCBARRIER(0(ra))
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
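	/*
	 * Illustration (the 32-bit case, NBYTES == 4): if rem == len == 3,
	 * then "SLL rem, len, 3" leaves 24 bits to keep, bits becomes
	 * 8*NBYTES - 24 = 8 bits to discard, SHIFT_DISCARD shifts the one
	 * unwanted byte out of the loaded word, and STREST to -1(t1) (the
	 * last byte of dst) writes the three surviving bytes to dst..dst+2
	 * without ever reading dst.
	 */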
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	jr	ra
	 move	len, zero
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	R10KCBARRIER(0(ra))
EXC(	STFIRST	t3, FIRST(0)(dst),	.Ls_exc)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
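/*
 * Note: LDFIRST/LDREST are the lwl/lwr pair (ldl/ldr with USE_DOUBLE);
 * LDFIRST picks up the bytes at the leading end of a misaligned unit
 * (address FIRST(N)) and LDREST the bytes at its trailing end (address
 * REST(N)), together assembling one full NBYTES unit in a register.
 * Which of the two instructions comes "first" is selected by the
 * endianness-dependent defines above.
 */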
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES */
	R10KCBARRIER(0(ra))
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
EXC(	 sb	t0, N(dst), .Ls_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
	jr	ra
	 nop
	END(memcpy)

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address of the clear
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	.set	push
	.set	noat
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
	.set	pop
#endif
	jr	ra
	 nop


#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
.Ls_exc_p ## n ## u:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc:
	jr	ra
	 nop
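/*
 * memmove's overlap test below: t0 = (src < dst + len) and
 * t1 = (dst < src + len).  Both are non-zero only if the regions
 * [src, src + len) and [dst, dst + len) overlap; in that case fall
 * through to __rmemcpy, which picks a safe copy direction, otherwise
 * a plain memcpy is safe and we branch straight to it.
 */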
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)