Path: blob/master/arch/sh/lib64/copy_user_memcpy.S
10817 views
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke ([email protected]) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
!
! Register usage as seen in the code below:
!     r2  = destination pointer (only read, never written here)
!     r3  = source pointer
!     r4  = byte count
!     r5  = r2 + r4 (one past the last destination byte; rebased by -16
!           on the Large path)
!     r6  = r3 + r4 on the small paths; r3 - r2 (source minus destination)
!           on the Large path
!     r18 = address branched to on exit (ptabs r18 / blink)
!
! NOTE(review): the "r0-r7: trashed" statement above looks incomplete for
! this SHmedia version: the code also writes r8, r9, r19-r25, r27 and r36
! as well as tr0 and tr1.  Confirm against the callers' expectations.

/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
   __copy_user function in the general case, so it has to be a distinct
   function from intra-kernel memcpy to allow for exception fix-ups in the
   event that the user pointer is bad somewhere in the copy (e.g. due to
   running off the end of the vma).

   Note, this algorithm will be slightly wasteful in the case where the source
   and destination pointers are equally aligned, because the stlo/sthi pairs
   could then be merged back into single stores.  If there are a lot of cache
   misses, this is probably offset by the stall lengths on the preloads.

*/

/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
 * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
 * instruction counts used in the jump address calculation.
 * */

	.section .text..SHmedia32,"ax"
	.little
	.balign 32
	.global copy_user_memcpy
	.global copy_user_memcpy_end
copy_user_memcpy:

/* Unaligned access helpers: a lo/hi instruction pair covering the quadword
 * (Q, 8 bytes) or longword (L, 4 bytes) at P+O.  Users of the load forms
 * OR the two result registers together.  STUAQ and STUAL are defined but
 * not used in the code visible here.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	/* Size dispatch.  Counts >= 25 (unsigned) go to Large.  Otherwise
	 * the branch target is computed relative to L0 as
	 *     L1 + 1 + 32 * (63 - nsb(count))
	 * i.e. one 32-byte slot per size class starting at L1.  The slot
	 * comments below label them 0, 1, 2-3, 4-7, 8-15 and 16-24 bytes.
	 * Do not add or remove instructions inside the slots: the layout
	 * is what makes the computed jump land correctly.
	 */
	nop ! ld.b r3,0,r63 ! TAKum03020
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0
	nsb r4,r0
	shlli r0,5,r0
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5		! r5 = destination end
	ptabs r18,tr1		! tr1 = exit target taken from r18
	add r3,r4,r6		! r6 = source end
	blink tr0,r63

/* Rearranged to make cut2 safe */
	.balign 8
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

	.balign 8
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	nop ! ld.b r2,0,r63 ! TAKum03020
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6		! last source byte (r6 = source end)
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7	! last 4 source bytes, completed by ldhi below
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7	! last 8 source bytes, completed by ldhi below
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7	! last 8 source bytes (may overlap the first 16)
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* Copies of 25 bytes or more.
	 *   r7  = r3 | -8, i.e. (source & 7) - 8
	 *   r22 = r2 - r7: destination cursor, placed so that r22 + r6 is
	 *         the source address rounded up to the next 8-byte boundary
	 *   r6  = r3 - r2, so ldx.q r22, r6 reads the source quadword that
	 *         corresponds to the destination cursor
	 *   r5  = destination end - 16: bound for Loop_ua
	 *   r27 = 64+8 for the initial size check, then destination
	 *         end - 64 as the bound for Loop_line
	 */
Large:
	! ld.b r2, 0, r63 ! TAKum03020
	pta/l Loop_ua, tr1
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 ! could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	bgtu/l r27, r4, tr1	! count < 72: skip the 32-byte line loop

	addi r5, -48, r27
	pta/l Loop_line, tr0
	addi r6, 64, r36	! only consumed by the commented-out prefetch
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

	/* Copies 32 bytes per iteration: r0 carries the quadword loaded by
	 * the previous iteration, r23-r25 the next three.
	 */
Loop_line:
	! ldx.q r22, r36, r63 ! TAKum03020
	alloco r22, 32
	synco			! guards the alloco, see the TAKum03020 note
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0

	/* Copies 8 bytes per iteration until the cursor reaches r5. */
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	/* Tail: store the pending quadword, then the last 8 source bytes
	 * to the last 8 destination bytes (r5 + 8 .. r5 + 15).
	 */
	add r3, r4, r7		! r7 = source end
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1		! tr1 was reused for Loop_ua; reload exit target
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63
	/* End-of-routine marker, exported above.  Presumably used by the
	 * exception fix-up code to recognise faults inside this routine;
	 * verify against the user of this symbol.
	 */
copy_user_memcpy_end:
	nop