/*
 * arch/alpha/lib/ev6-stxncpy.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Copy no more than COUNT bytes of the null-terminated string from
 * SRC to DST.
 *
 * This is an internal routine used by strncpy, stpncpy, and strncat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *	a2 = COUNT
 *
 * Furthermore, COUNT may not be zero.  A COUNT with the top bit set
 * is treated as LONG_MAX, so that biasing it by the destination
 * misalignment cannot wrap around.
 *
 * On output:
 *	t0  = last word written
 *	t10 = bitmask (with one bit set) indicating the byte position of
 *	      the end of the range specified by COUNT
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *	a2  = the number of full words left in COUNT
 *
 * Furthermore, v0, a3-a5, t11, and $at are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor */


	.ent stxncpy_aligned
	.align 4
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E : (stall)
	beq	a2, $a_eoc	# U :

	bne	t8, $a_eos	# U :
	nop
	nop
	nop

	/* On entry to this basic block:
	   t0 == a source word not containing a null.  */

	/*
	 * nops here to:
	 *	separate store quads from load quads
	 *	limit of 1 bcond/quad to permit training
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

	ldq_u	t0, 0(a1)	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E :
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */

$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t12, t6, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : clear src bytes > null (stall)

	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
$a_eoc:
	or	t10, t8, t8	# E :
	br	$a_eos		# L0 : Latency=3
	nop
	nop

	.end stxncpy_aligned

	.align 4
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Are source and destination co-aligned?

	   COUNT is about to be biased by the destination misalignment
	   (0..7).  If COUNT were within 7 of 2^64 that addition would
	   wrap and leave a tiny count, so first bound any COUNT with the
	   top bit set to LONG_MAX.  t2 is free here: it is rewritten by
	   the "and a2, 7, t2" below before anything else reads it.  */
	lda	t2, -1		# E : t2 = all ones
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	srl	t2, 1, t2	# U : t2 = LONG_MAX

	cmovlt	a2, t2, a2	# E : Latency=2, extra map slot; bound count
	nop			# E : keep with cmovlt
	and	t1, 7, t1	# E :
	addq	a2, t0, a2	# E : bias count by dest misalignment

	subq	a2, 1, a2	# E :
	and	a2, 7, t2	# E : (stall)
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8 (stall)
	addq	zero, 1, t10	# E :

	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	bne	t1, $unaligned	# U :
	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :

	beq	t0, stxncpy_aligned	# U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :
	nop
	nop

	br	stxncpy_aligned	# L0 : Latency=3
	nop
	nop
	nop



/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	lda	t6, -1		# E :
	nop

	bne	t8, $u_final	# U :
	mskql	t6, a1, t6	# U : mask out bits already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E : (stall)

	cmpbge	zero, t2, t8	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t8	# E :
	nop
	bne	t8, $u_eos	# U :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	or	t0, t1, t0	# E : current dst word now complete
	subq	a2, 1, a2	# E : decrement word count
	extql	t2, a1, t1	# U : extract high bits for next time
	addq	a0, 8, a0	# E :

	stq_u	t0, -8(a0)	# L : save the current word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
	cmpbge	zero, t2, t8	# E : test new word for eos
	nop
	beq	t8, $u_loop	# U :

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	or	t0, t1, t0	# E : first (partial) source word complete
	nop
	cmpbge	zero, t0, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E : (stall)
	cmoveq	a2, t6, t8	# E : Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : kill source bytes > null

	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

	/* Got to end-of-count before end of string.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word  */
$u_eoc:
	and	a1, 7, t6	# E : avoid final load if possible
	sll	t10, t6, t6	# U : (stall)
	and	t6, 0xff, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t2, 8(a1)	# L : load final src word
	nop
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
	or	t1, t0, t1	# E : (stall)

1:	cmpbge	zero, t1, t8	# E :
	mov	t1, t0		# E :

$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# L0 : Latency=3

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

1:	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t12, $u_head	# U : (stall)

	extql	t2, a1, t2	# U :
	cmpbge	zero, t1, t8	# E : is there a zero?
	andnot	t2, t6, t2	# E : dest mask for a single word copy
	or	t8, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t8, t3, t8	# E : (stall)

	beq	t8, $u_head	# U :
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :

	and	t6, t8, t12	# E :
	subq	t12, 1, t6	# E : (stall)
	or	t6, t12, t8	# E : (stall)
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	zapnot	t1, t8, t1	# U : to source validity mask
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3
	nop
	nop
	nop

	.end __stxncpy