/* Path: arch/alpha/lib/ev6-strncpy_from_user.S */
/*
 * arch/alpha/lib/ev6-strncpy_from_user.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Just like strncpy except in the return value:
 *
 *	-EFAULT       if an exception occurs before the terminator is copied.
 *	N             if the buffer filled.
 *
 * Otherwise the length of the string is returned.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * A bunch of instructions got moved and temp registers were changed
 * to aid in scheduling.  Control flow was also re-arranged to eliminate
 * branches, and to provide longer code sequences to enable better scheduling.
 * A total rewrite (using byte load/stores for start & tail sequences)
 * is desirable, but very difficult to do without a from-scratch rewrite.
 * Save that for the future.
 */


#include <asm/errno.h>
#include <asm/regdef.h>


/* Allow an exception for an insn; exit if we get one.
   The faulting load is recorded in __ex_table; on a fault the fixup
   resumes at $exception.  NOTE(review): the ($0) operand appears to
   encode v0 as the register receiving -EFAULT — confirm against the
   arch/alpha exception-fixup format.  */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($0);	\
	.previous


	.set noat
	.set noreorder
	.text

	.globl __strncpy_from_user
	.ent __strncpy_from_user
	.frame $30, 0, $26
	.prologue 0

	/*
	 * Entry registers (see header for the return contract):
	 *   a0 = destination buffer (kernel)
	 *   a1 = source string (user space; every load from it is EX-wrapped)
	 *   a2 = maximum byte count
	 * Result is returned in v0.
	 */
	.align 4
__strncpy_from_user:
	and	a0, 7, t3	# E : find dest misalignment
	beq	a2, $zerolength # U :

	/* Are source and destination co-aligned?  */
	mov	a0, v0		# E : save the string start
	xor	a0, a1, t4	# E :
	EX( ldq_u t1, 0(a1) )	# L : Latency=3  load first quadword
	ldq_u	t0, 0(a0)	# L : load first (partial) aligned dest quadword

	addq	a2, t3, a2	# E : bias count by dest misalignment
	subq	a2, 1, a3	# E :
	addq	zero, 1, t10	# E :
	and	t4, 7, t4	# E : misalignment between the two

	and	a3, 7, t6	# E : number of tail bytes
	sll	t10, t6, t10	# E : t10 = bitmask of last count byte
	bne	t4, $unaligned	# U :
	lda	t2, -1		# E : build a mask against false zero

	/*
	 * We are co-aligned; take care of a partial first word.
	 * On entry to this basic block:
	 * t0 == the first destination word for masking back in
	 * t1 == the first source word.
	 */

	srl	a3, 3, a2	# E : a2 = loop counter = (count - 1)/8
	addq	a1, 8, a1	# E :
	mskqh	t2, a1, t2	# U : detection in the src word
	nop

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	mskqh	t1, a1, t3	# U :
	mskql	t0, a1, t0	# U : assemble the first output word
	ornot	t1, t2, t2	# E :
	nop

	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E :
	beq	a2, $a_eoc	# U :
	bne	t8, $a_eos	# U : 2nd branch in a quad.  Bad.

	/* On entry to this basic block:
	   t0 == a source quad not containing a null.
	   a0 - current aligned destination address
	   a1 - current aligned source address
	   a2 - count of quadwords to move.
	   NOTE: Loop improvement - unrolling this is going to be
		a huge win, since we're going to stall otherwise.
		Fix this later.  For _really_ large copies, look
		at using wh64 on a look-ahead basis.  See the code
		in clear_user.S and copy_user.S.
	   Presumably, since (a0) and (a1) do not overlap (by C definition)
	   Lots of nops here:
	     - Separate loads from stores
	     - Keep it to 1 branch/quadpack so the branch predictor
	       can train.
	*/
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	subq	a2, 1, a2	# E :

	EX( ldq_u t0, 0(a1) )	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E : Stall 2 cycles on t0
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E :

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t12, t6, t8	# E :
	zapnot	t0, t8, t0	# U : clear src bytes > null
	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# L :
	br	$finish_up	# L0 :
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
	.align 4
$a_eoc:
	or	t10, t8, t8
	br	$a_eos
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	EX( ldq_u t2, 8(a1) )	# L : load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U :

	or	t1, t4, t1	# E : first aligned src word complete
	mskqh	t1, a0, t1	# U : mask leading garbage in src
	or	t0, t1, t0	# E : first output word complete
	or	t0, t6, t6	# E : mask original data for zero test

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	bne	t8, $u_final	# U : bad news - 2nd branch in a quad
	lda	t6, -1		# E : mask out the bits we have

	mskql	t6, a1, t6	# U : already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E :
	cmpbge	zero, t2, t8	# E : find nulls in second partial

	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :
	nop

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	extql	t2, a1, t1	# U : position hi-bits of lo word
	EX( ldq_u t2, 8(a1) )	# L : read next high-order source word
	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E :

	beq	a2, $u_eoc	# U :
	bne	t8, $u_eos	# U :
	nop
	nop

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	/*
	 * Extra nops here:
	 *	separate load quads from store quads
	 *	only one branch/quad to permit predictor training
	 */

	.align 4
$u_loop:
	extqh	t2, a1, t0	# U : extract high bits for current word
	addq	a1, 8, a1	# E :
	extql	t2, a1, t3	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	or	t0, t1, t0	# E : current dst word now complete
	EX( ldq_u t2, 0(a1) )	# L : load high word for next time
	subq	a2, 1, a2	# E :
	nop

	stq_u	t0, -8(a0)	# L : save the current word
	mov	t3, t1		# E :
	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	a2, $u_eoc	# U :

	beq	t8, $u_loop	# U :
	nop
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
	.align 4
$u_eos:
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : first (partial) source word complete
	cmpbge	zero, t0, t8	# E : is the null in this first bit?
	nop

	bne	t8, $u_final	# U :
	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :

	.align 4
$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E :
	cmoveq	a2, t6, t8	# E :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
	.align 4
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E :
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :

	or	t6, t12, t8	# E :
	zapnot	t0, t8, t0	# U : kill source bytes > null
	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E :

	stq_u	t0, 0(a0)	# E :
	br	$finish_up	# U :
	nop
	nop

	.align 4
$u_eoc:				# end-of-count
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E :
	cmpbge	zero, t0, t8	# E :
	nop

	.align 4
$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# U :
	nop
	nop


	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	srl	a3, 3, a2	# U : a2 = loop counter = (count - 1)/8
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	mov	zero, t0	# E :

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# E :
	nop
	nop
	nop

	.align 4
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later

	beq	t12, $u_head	# U :
	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t8	# E : is there a zero?
	nop

	extql	t2, a1, t2	# U :
	or	t8, t10, t5	# E : test for end-of-count too
	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot

	nop			# E : goes with cmov
	andnot	t8, t3, t8	# E :
	beq	t8, $u_head	# U :
	nop

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :
	and	t6, t8, t12	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E :
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes
	zapnot	t1, t8, t1	# U : to source validity mask

	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there
	stq_u	t0, 0(a0)	# L :
	nop

	/* Compute the return value: locate the address of the last byte
	   written (binary search on the low-bit mask in t12), add one if
	   the buffer filled, and subtract the saved start (v0).  */
	.align 4
$finish_up:
	zapnot	t0, t12, t4	# U : was last byte written null?
	and	t12, 0xf0, t3	# E : binary search for the address of the
	cmovne	t4, 1, t4	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	and	t12, 0xcc, t2	# E : last byte written
	and	t12, 0xaa, t1	# E :
	cmovne	t3, 4, t3	# E : Latency=2, extra map slot
	nop			# E : with cmovne

	bic	a0, 7, t0
	cmovne	t2, 2, t2	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	nop

	cmovne	t1, 1, t1	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	addq	t0, t3, t0	# E :
	addq	t1, t2, t1	# E :

	addq	t0, t1, t0	# E :
	addq	t0, t4, t0	# add one if we filled the buffer
	subq	t0, v0, v0	# find string length
	ret			# L0 :

	/* Count was zero on entry: return 0.  Falls through to the
	   shared ret at $exception.  */
	.align 4
$zerolength:
	nop
	nop
	nop
	clr	v0

	/* Fault fixup target for the EX()-wrapped user loads.  */
$exception:
	nop
	nop
	nop
	ret

	.end __strncpy_from_user