/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-stxncpy.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Copy no more than COUNT bytes of the null-terminated string from
 * SRC to DST.
 *
 * This is an internal routine used by strncpy, stpncpy, and strncat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *	a2 = COUNT
 *
 * Furthermore, COUNT may not be zero.
 *
 * On output:
 *	t0  = last word written
 *	t10 = bitmask (with one bit set) indicating the byte position of
 *	      the end of the range specified by COUNT
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *	a2  = the number of full words left in COUNT
 *
 * Furthermore, v0, a3-a5, t11, and $at are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 *
 * Layout note: instructions are grouped four to a block, separated by
 * blank lines; the nops pad those groups.  Every statement must stay on
 * its own line, because '#' comments run to the end of the line.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor */

/* stxncpy_aligned: copy path used when SRC and DST share the same
   alignment within a quadword.  It is not exported; __stxncpy below
   branches here (it is not called), so it returns straight to the
   original caller through t9.  */

	.ent stxncpy_aligned
	.align 4
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E : (stall)
	beq	a2, $a_eoc	# U :

	bne	t8, $a_eos	# U :
	nop
	nop
	nop

	/* On entry to this basic block:
	   t0 == a source word not containing a null.  */

	/*
	 * nops here to:
	 *	separate store quads from load quads
	 *	limit of 1 bcond/quad to permit training
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

	ldq_u	t0, 0(a1)	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E :
	beq	a2, $a_eoc	# U :

	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */

$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t12, t6, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : clear src bytes > null (stall)

	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  */
$a_eoc:
	or	t10, t8, t8	# E :
	br	$a_eos		# L0 : Latency=3
	nop
	nop

	.end stxncpy_aligned

/* __stxncpy: exported entry point.  Computes the count bookkeeping
   (a2 = number of full words, t10 = end-of-count byte bit), then
   dispatches to the co-aligned path above or to $unaligned below.  */

	.align 4
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	and	t1, 7, t1	# E : (stall)
	addq	a2, t0, a2	# E : bias count by dest misalignment (stall)

	subq	a2, 1, a2	# E :
	and	a2, 7, t2	# E : (stall)
	srl	a2, 3, a2	# U : a2 = loop counter = (count - 1)/8 (stall)
	addq	zero, 1, t10	# E :

	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	bne	t1, $unaligned	# U :
	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :

	beq	t0, stxncpy_aligned	# U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :
	nop
	nop

	br	stxncpy_aligned	# L0 : Latency=3
	nop
	nop
	nop



/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	lda	t6, -1		# E :
	nop

	bne	t8, $u_final	# U :
	mskql	t6, a1, t6	# U : mask out bits already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E : (stall)

	cmpbge	zero, t2, t8	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t8	# E :
	nop
	bne	t8, $u_eos	# U :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	or	t0, t1, t0	# E : current dst word now complete
	subq	a2, 1, a2	# E : decrement word count
	extql	t2, a1, t1	# U : extract low bits for next time
	addq	a0, 8, a0	# E :

	stq_u	t0, -8(a0)	# L : save the current word
	beq	a2, $u_eoc	# U :
	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
	cmpbge	zero, t2, t8	# E : test new word for eos
	nop
	beq	t8, $u_loop	# U :

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	or	t0, t1, t0	# E : first (partial) source word complete
	nop
	cmpbge	zero, t0, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E : (stall)
	cmoveq	a2, t6, t8	# E : Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t0, t8, t0	# U : kill source bytes > null

	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

	/* Got to end-of-count before end of string.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word  */
$u_eoc:
	and	a1, 7, t6	# E : avoid final load if possible
	sll	t10, t6, t6	# U : (stall)
	and	t6, 0xff, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t2, 8(a1)	# L : load final src word
	nop
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
	or	t1, t0, t1	# E : (stall)

1:	cmpbge	zero, t1, t8	# E :
	mov	t1, t0		# E :

$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# L0 : Latency=3

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	/* When the destination is aligned (t4 == 0) the branch below skips
	   the load, the mask build and the subq; t0 and t6 stay zero and
	   a1 needs no adjustment, since it would have subtracted zero.  */
	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

1:	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t12, $u_head	# U : (stall)

	extql	t2, a1, t2	# U :
	cmpbge	zero, t1, t8	# E : is there a zero?
	andnot	t2, t6, t2	# E : dest mask for a single word copy
	or	t8, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t8, t3, t8	# E : (stall)

	beq	t8, $u_head	# U :
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :

	and	t6, t8, t12	# E :
	subq	t12, 1, t6	# E : (stall)
	or	t6, t12, t8	# E : (stall)
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	zapnot	t1, t8, t1	# U : to source validity mask
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3
	nop
	nop
	nop

	.end __stxncpy