arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table, "a";		\
	PTR	9b, handler;			\
	.previous
/*
 * Only on a 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
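/*
 * A minimal sketch of how an LDFIRST/LDREST pair handles an unaligned
 * source (big-endian and USE_DOUBLE assumed, so LDFIRST is ldl and
 * LDREST is ldr; the address is hypothetical).  For src = 0x1003:
 *
 *	LDFIRST	t0, FIRST(0)(src)	# ldl at 0x1003: bytes 0x1003..0x1007
 *					# fill the high end of t0
 *	LDREST	t0, REST(0)(src)	# ldr at 0x100a: bytes 0x1008..0x100a
 *					# fill the low end of t0
 *
 * Together the pair assembles the eight unaligned source bytes into t0,
 * which can then be written out with a single STORE.
 */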
	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	nop
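	#
	# Note on the store fixup labels above: len has already been
	# decremented by 16*NBYTES when the stores are issued, so a fault
	# on the store of unit k still leaves (16-k)*NBYTES bytes uncopied.
	# The s_exc_pNu handlers defined later in this file simply add
	# N*NBYTES back onto len and return.
	#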
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left.  This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)
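/*
 * Exception fixup handlers.  l_exc_copy/l_exc recover from faulting
 * loads (copy_from_user): bytes are copied one at a time up to the
 * faulting address, then the rest of the destination buffer is zeroed,
 * as required by the spec above.  The s_exc* handlers recover from
 * faulting stores (copy_to_user) and only need to fix up len.
 */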
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in dst (a0)
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)				/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up	# src >= dst
	nop
	ADD	a0, a2			# dst = dst + len
	ADD	a1, a2			# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)
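/*
 * Note on memmove's overlap test above: the ranges [src, src+len) and
 * [dst, dst+len) overlap iff (a1 < a0 + a2) && (a0 < a1 + a2), which is
 * exactly what the two sltu instructions and the and compute.  If either
 * test fails the buffers are disjoint and the faster forward __memcpy is
 * used; otherwise __rmemcpy picks a copy direction (backwards, from the
 * last byte, when src < dst) that never overwrites source bytes before
 * they have been read.
 */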