/*
 * arch/alpha/lib/ev6-stxcpy.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Copy a null-terminated string from SRC to DST.
 *
 * This is an internal routine used by strcpy, stpcpy, and strcat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *
 * On output:
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *
 * Furthermore, v0, a3-a5, and t11 are untouched.
 * (t12 is NOT preserved: it carries the output bitmask described above
 * and is also used as a scratch register on the unaligned entry path.)
 *
 * Registers written by this file: a0, a1, t0-t6, t8, t12.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor.  */


	.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.
	   (a1 has already been advanced by 8 by __stxcpy; only its low
	   three bits, i.e. the byte offset within the word, are used by
	   the msk* instructions below.)  */

	/* Create the 1st output word and detect 0's in the 1st input word.
	   Bytes of the source word that lie before the start of the string
	   are forced to 0xff for the test, so that garbage preceding the
	   string cannot be mistaken for the terminator.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  Bit 7 set in t12 means the
	   null is the last byte of the word, so the whole word is
	   source data and can be stored as is.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E :
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t12, t6, t8	# E : (stall)

	zap	t0, t8, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned

	.align 4
	.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)		# L : load first src word
	and	a0, 7, t0		# E : take care not to load a word ...
	addq	a1, 8, a1		# E :
	beq	t0, stxcpy_aligned	# U : ... if we wont need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6		# U : mask out the bits we have
	or	t6, t2, t2		# E :   already extracted before (stall)
	cmpbge	zero, t2, t8		# E :   testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# L : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	beq	t12, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.)

	   NOTE(review): the "e0"/"e1" annotations on the last few
	   instructions are not part of the scheduling notation defined in
	   the file header; presumably they are left over from an earlier
	   (EV5) version of this routine -- confirm before relying on them.  */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t12	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	srl	t12, t5, t12	# U : adjust final null return value
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# .. e1 : zero place for source to reside (stall)

	or	t0, t1, t1	# e1    : and put it there
	stq_u	t1, 0(a0)	# .. e0 : (stall)
	ret	(t9)		# e1    :
	nop

	.end __stxcpy