Path: arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
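
/*
 * A minimal C-level sketch of the contracts above, for orientation only.
 * It is not part of the build, and __copy_user's real interface is the
 * non-standard register convention described above, not a C prototype:
 *
 *	void *memcpy(void *dst, const void *src, size_t len);
 *		// returns dst; assumes src/dst do not overlap
 *
 *	// conceptual model of __copy_user:
 *	//	residue = __copy_user(dst, src, len);
 *	// residue is 0 on success, else an upper bound on the bytes
 *	// left uncopied, so a caller can compute len - residue bytes
 *	// of progress.
 */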
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR_WD  9b, handler;                    \
        .previous
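
/*
 * For illustration: with LOAD defined as ld below,
 * EXC(LOAD t0, UNIT(0)(src), l_exc) expands roughly to
 *
 *	9:	ld	t0, 0(src)
 *		.section __ex_table,"a"
 *		PTR_WD	9b, l_exc
 *		.previous
 *
 * i.e. the address of the access is recorded in __ex_table, and the
 * kernel's exception dispatch transfers control to l_exc if that load
 * takes a fault.
 */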
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD    ld
#define LOADL   ldl
#define LOADR   ldr
#define STOREL  sdl
#define STORER  sdr
#define STORE   sd
#define ADD     daddu
#define SUB     dsubu
#define SRL     dsrl
#define SRA     dsra
#define SLL     dsll
#define SLLV    dsllv
#define SRLV    dsrlv
#define NBYTES  8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the n64
 * ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
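
/*
 * Worked example of the unaligned-access macros (big-endian case, so
 * LDFIRST is ldl and LDREST is ldr; NBYTES is 8): for unit 1,
 * FIRST(1) == 8 and REST(1) == 15, so
 *
 *	LDFIRST	t0, FIRST(1)(src)	# ldl	t0, 8(src)
 *	LDREST	t0, REST(1)(src)	# ldr	t0, 15(src)
 *
 * is the classic ldl/ldr pair that assembles one 64-bit word from a
 * possibly unaligned src into t0.
 */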
        .text
        .set    noreorder
        .set    noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
         and    t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
         sltu   t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
         sltu   t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
         sltu   t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
         sltu   t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
         sltu   t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
         pref   0, 128(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if there is more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-7)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-6)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-5)(src),      l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-8)(dst),      s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst),      s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-3)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-2)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-1)(src),      l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst),      s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
         sltu   t0, len, 128            # See if we can loop more times
        beqz    t0, 1b
         nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 8*NBYTES
        bnez    t0, less_than_8units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
         ADD    dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
         nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b       copy_bytes_checklen
EXC(     STORE  t0, -8(dst),            s_exc_p1u)

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy_rewind16:
        /* Rewind src and dst by 16*NBYTES for l_exc_copy */
        SUB     src, src, 16*NBYTES
        SUB     dst, dst, 16*NBYTES
l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len = number of uncopied bytes
        jr      ra
         nop
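
/*
 * Worked example of the l_exc arithmetic (addresses are illustrative
 * only): for a 64-byte copy_from_user starting at src_entry 0x1000,
 * uaccess.h sets AT to 0x1040.  If a load faults and THREAD_BUADDR
 * reports 0x1020 as the first bad address, l_exc returns
 * len = 0x1040 - 0x1020 = 0x20, i.e. 32 bytes were not copied.
 */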
#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop

        .align  5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0              # dst + len <= src -> memcpy
        sltu    t1, a0, t1              # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                  /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                         /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up      # src >= dst
         nop
        ADD     a0, a2                  # dst = dst + len
        ADD     a1, a2                  # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)
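
/*
 * A hedged C sketch of the dispatch implemented by memmove/__rmemcpy
 * above, for reference only (names mirror the assembly labels; the
 * real return value is carried in v0, set before the fall-through):
 *
 *	void *memmove(void *dst, const void *src, size_t len)
 *	{
 *		// no overlap: the plain forward memcpy is safe
 *		if (src + len <= dst || dst + len <= src)
 *			return memcpy(dst, src, len);
 *		if (len == 0)
 *			return dst;
 *		__rmemcpy(dst, src, len);	// byte-at-a-time; copies
 *						// backwards when dst > src
 *		return dst;
 *	}
 */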