/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-stxcpy.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Copy a null-terminated string from SRC to DST.
 *
 * This is an internal routine used by strcpy, stpcpy, and strcat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *
 * On output:
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *
 * Furthermore, v0, a3-a5, and t11 are untouched.  (t12 carries the
 * result bitmask described above, so it is NOT preserved.)  The
 * routine uses t0-t6 and t8 as scratch and advances a1.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
   doesn't like putting the entry point for a procedure somewhere in the
   middle of the procedure descriptor.  Work around this by putting the
   aligned copy in its own procedure descriptor.  */


	.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* Co-aligned copy: SRC and DST share the same offset within a
	   quadword, so whole source words can be stored unshifted.

	   On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.

	   __stxcpy has already advanced a1 by 8 when it branches here;
	   the msk* instructions below only consume the low three bits of
	   a1, which that addition leaves unchanged.  */

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U : t3 = source bytes at/after the start offset
	ornot	t1, t2, t2	# E : force bytes before the string non-zero (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t12	# E : t12 = t8 & -t8, lowest set bit only (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  Bit 7 of t12 set means the
	   null is the last byte of the word, so the whole word is
	   written and there is nothing to merge.  */
	and	t12, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E : t6 = byte mask of bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t12, t6, t8	# E : t8 = byte mask of bytes up to and incl. null (stall)

	zap	t0, t8, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned

	.align 4
	.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Entry point.  Dispatches to the co-aligned copy above when SRC
	   and DST have the same offset within a quadword, and to the
	   shifting copy at $unaligned otherwise.  */

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	and	a0, 7, t0	# E : take care not to load a word ...
	addq	a1, 8, a1	# E :
	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E : dest-owned bytes must not look like nulls
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6	# U : mask out the bits we have
	or	t6, t2, t2	# E :   already extracted before (stall)
	cmpbge	zero, t2, t8	# E :   testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# L : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : t12 = t8 & -t8 (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : null is the word's last byte: store it whole (stall)

	ldq_u	t0, 0(a0)	# L :
	subq	t12, 1, t6	# E : t6 = byte mask of bytes below the null
	or	t6, t12, t8	# E : t8 = byte mask of bytes up to and incl. null (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U : dest aligned: keep t0 == t6 == 0
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	beq	t12, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	/* NOTE(review): the ".. e1" / "e0" annotations on the last few
	   instructions below do not use the E/U/L notation defined in the
	   file header; presumably left over from an earlier version of
	   this routine -- confirm before relying on them.  */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t12	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	srl	t12, t5, t12	# U : adjust final null return value
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# .. e1 : zero place for source to reside (stall)

	or	t0, t1, t1	# e1    : and put it there
	stq_u	t1, 0(a0)	# .. e0 : (stall)
	ret	(t9)		# e1    :
	nop

	.end __stxcpy