/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memsetw to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */

	.set noat
	.set noreorder
	.text
	.globl __memset
	.globl __memsetw
	.globl __constant_c_memset
	.globl memset

	.ent __memset
	.align 5
__memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8
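
	/*
	 * For reference only: the four-instruction groups above build the
	 * fill pattern by replicating the low byte of the 'c' argument
	 * ($17) across all eight bytes of a quadword.  A rough C
	 * equivalent (a sketch, not part of the build; 'c' and 'pattern'
	 * are illustrative names):
	 *
	 *	unsigned long pattern = c & 0xff;	// 00000000000000ch
	 *	pattern |= pattern << 8;		// 000000000000chch
	 *	pattern |= pattern << 16;		// 00000000chchchch
	 *	pattern |= pattern << 32;		// chchchchchchchch
	 */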

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */
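
	/*
	 * Shape of the unrolled loop below in rough C terms (a sketch
	 * only; 'p' is a quadword pointer, 'quads' is $3, and wh64()
	 * stands in for the write-hint instruction, whose target address
	 * the real code computes one trip in advance):
	 *
	 *	do {
	 *		wh64(quads < 24 ? p + 8 : p + 16);	// 64 or 128 bytes ahead
	 *		for (i = 0; i < 8; i++)
	 *			p[i] = pattern;			// the eight stq's
	 *		p += 8;
	 *		quads -= 8;
	 *	} while (quads >= 8);
	 */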

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :
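
	/*
	 * The 0..7 trailing bytes above are merged into the final
	 * quadword.  Roughly, in C (a sketch; 'tail' is $18 and is known
	 * to be non-zero here, 'p' points at the last quadword, and Alpha
	 * is little-endian):
	 *
	 *	unsigned long keep = ~0ul << (8 * tail);	// bytes tail..7
	 *	*p = (*p & keep) | (pattern & ~keep);		// fill bytes 0..tail-1
	 */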

within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __memset

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 */
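
	/*
	 * Note that, unlike __memset above, this entry point does no
	 * replication of its own: it stores $17 as-is, so the caller must
	 * already have spread the fill byte across the whole quadword.
	 * A caller-side sketch in C (illustrative only, not a specific
	 * kernel call site):
	 *
	 *	__constant_c_memset(dest, 0x0101010101010101ul * (c & 0xff), count);
	 */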
	.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 */
	.align 5
	.ent __memsetw

__memsetw:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8
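
	/*
	 * As in __memset, the groups above replicate the fill value, but
	 * the unit here is the 16-bit halfword in the low two bytes of
	 * $17 rather than a single byte.  Roughly, in C (sketch only):
	 *
	 *	unsigned long pattern = c & 0xffff;	// 000000000000c1c2
	 *	pattern |= pattern << 16;		// 00000000c1c2c1c2
	 *	pattern |= pattern << 32;		// c1c2c1c2c1c2c1c2
	 */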

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w # U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memsetw

memset = __memset
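
/*
 * The assignment above aliases the plain C-visible name to the tuned
 * __memset entry, so ordinary calls land here.  Usage sketch:
 *
 *	char buf[256];
 *	memset(buf, 0x5a, sizeof(buf));
 */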