/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we are sharing code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\

#define ADDC32(sum,reg)						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(sum, _t0);						\
	ADDC(sum, _t1);						\
	ADDC(sum, _t2);						\
	ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */

#define src a0
#define sum v0
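
/*
 * The ADDC()/ADDC32() macros above implement the ones'-complement
 * accumulation the IP checksum needs: add a word into the running sum
 * and wrap any carry out of the top bit back into bit 0 (end-around
 * carry).  A rough C sketch of one step, for illustration only (the
 * helper name is made up; with USE_DOUBLE the word is 64 bits wide):
 *
 *	static inline unsigned long addc(unsigned long sum, unsigned long w)
 *	{
 *		sum += w;
 *		if (sum < w)		// unsigned overflow => carry out
 *			sum += 1;	// end-around carry
 *		return sum;
 *	}
 *
 * csum_partial() below applies this word by word to a1 bytes at a0 and
 * finally adds the caller's partial checksum passed in a2.
 */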

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	andi	t0, a1, 2

	/* Still a full word to go */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32		/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	andi	t0, a1, 1

	/* Still a halfword to go */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	sll	t1, t1, 16

	lbu	t2, (src)
	nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)
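
	/*
	 * The "fold checksum" sequence below squeezes the 64-bit
	 * accumulator used with USE_DOUBLE into 32 bits, again with an
	 * end-around carry.  A rough C equivalent (sketch only, local
	 * names are illustrative):
	 *
	 *	u64 s = sum;
	 *	s = (s & 0xffffffffULL) + (s >> 32);	// add the two halves
	 *	s += s >> 32;				// fold the carry back in
	 *	sum = (u32)s;
	 *
	 * The odd-buffer fixup that follows swaps the bytes within each
	 * 16-bit half of the sum when the buffer started on an odd
	 * address, since the bytes were then accumulated with swapped
	 * significance.
	 */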
	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

	/* odd buffer alignment? */
#ifdef CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, t7
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	/* Add the passed partial csum.  */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif
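
/*
 * A C-level view of the two entry points below (a sketch; the kernel
 * declarations use __wsum and live in <asm/checksum.h>, the types here
 * are simplified):
 *
 *	// copy len bytes from src to dst and return the ones'-complement
 *	// sum of the copied data, seeded with sum; no fault handling
 *	unsigned int csum_partial_copy_nocheck(const void *src, void *dst,
 *					       int len, unsigned int sum);
 *
 *	// same, but the accesses may fault; on a fault *errp is set to
 *	// -EFAULT, a faulting load additionally zeroes the uncopied tail
 *	// of dst, and a faulting store returns -1 as the checksum
 *	unsigned int __csum_partial_copy_user(const void *src, void *dst,
 *					      int len, unsigned int sum,
 *					      int *errp);
 */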

LEAF(__csum_partial_copy_user)
	PTR_ADDU	AT, src, len	/* See (1) above. */
#ifdef CONFIG_64BIT
	move	errptr, a4
#else
	lw	errptr, 16(sp)
#endif
FEXPORT(csum_partial_copy_nocheck)
	move	sum, zero
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen
	and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned
	nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
	nop
	SUB	len, 8*NBYTES			# subtract here for bgez loop
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t5, UNIT(5)(src),	.Ll_exc_copy)
EXC(	LOAD	t6, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(7)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
EXC(	STORE	t4, UNIT(4)(dst),	.Ls_exc)
	ADDC(sum, t4)
EXC(	STORE	t5, UNIT(5)(dst),	.Ls_exc)
	ADDC(sum, t5)
EXC(	STORE	t6, UNIT(6)(dst),	.Ls_exc)
	ADDC(sum, t6)
EXC(	STORE	t7, UNIT(7)(dst),	.Ls_exc)
	ADDC(sum, t7)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES			# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
#define rem t7
	beqz	len, .Ldone
	sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	and	rem, len, (NBYTES-1)		# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	nop
1:
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder
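
	/*
	 * The tail-store sequence below handles the last 1..NBYTES-1
	 * bytes in one go: load a whole word from src, discard the bytes
	 * past "len" with SHIFT_DISCARD, write just the wanted bytes with
	 * STREST, and add only the kept bytes to the sum.  Roughly, for
	 * the little-endian case (sketch only; helper names are made up):
	 *
	 *	unsigned long mask = ~0UL >> (8 * (NBYTES - len));
	 *	unsigned long v = load_word(src) & mask;  // keep low len bytes
	 *	store_low_bytes(dst, v, len);		  // write len bytes only
	 *	sum = addc(sum, v);
	 *
	 * The big-endian case discards from the other end (SRLV instead
	 * of SLLV).
	 */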
	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	ADD	t1, dst, len		# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3		# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem		# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set	reorder
	ADDC(sum, t0)
	b	.Ldone
	.set	noreorder
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1		# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST	t3, FIRST(0)(dst),	.Ls_exc)
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	nop
.Lcopy_bytes:
	/* 0 < len < NBYTES */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
	move	t2, zero		# partial word
	li	t3, SHIFT_START		# shift
	/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)				\
EXC(	lbu	t0, N(src), .Ll_exc_copy);	\
	SUB	len, len, 1;			\
EXC(	sb	t0, N(dst), .Ls_exc);		\
	SLLV	t0, t0, t3;			\
	addu	t3, SHIFT_INC;			\
	beqz	len, .Lcopy_bytes_done;		\
	or	t2, t0
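
	/*
	 * COPY_BYTE() above copies one byte and also merges it into the
	 * partial word in t2 at the position it would occupy in a whole
	 * word load; SHIFT_START/SHIFT_INC pick the endian-correct order,
	 * so the final ADDC() sees the bytes with their proper weights.
	 * A rough C sketch (illustrative only):
	 *
	 *	unsigned long partial = 0;
	 *	int shift = SHIFT_START;	// 0 on LE, 8*(NBYTES-1) on BE
	 *	while (len--) {
	 *		unsigned char c = *src++;
	 *		*dst++ = c;
	 *		partial |= (unsigned long)c << shift;
	 *		shift += SHIFT_INC;	// +8 on LE, -8 on BE
	 *	}
	 *	sum = addc(sum, partial);
	 */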

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lbu	t0, NBYTES-2(src), .Ll_exc_copy)
	SUB	len, len, 1
EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done:
	ADDC(sum, t2)
.Ldone:
	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#ifdef CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, odd
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	ADDC32(sum, psum)
	jr	ra
	.set	noreorder

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	li	t2, SHIFT_START
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lbu	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	SLLV	t1, t1, t2
	addu	t2, SHIFT_INC
	ADDC(sum, t1)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem;
	 * see (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	.set	push
	.set	noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	SUB	src, src, 1
#else
	li	v1, 1
	bnez	src, 1b
	SUB	src, src, v1
#endif
	li	v1, -EFAULT
	b	.Ldone
	sw	v1, (errptr)

.Ls_exc:
	li	v0, -1	/* invalid checksum */
	li	v1, -EFAULT
	jr	ra
	sw	v1, (errptr)
	.set	pop
	END(__csum_partial_copy_user)