/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * A compact, efficient implementation of the C library "memset()"
 * function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	(abbreviated as 'CWG' in other comments here)
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 *
 * Scheduling notation used in the per-instruction comments:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * The algorithm for the leading and trailing quadwords is unchanged.
 * The main loop, however, has been unrolled for better memory
 * throughput, and the body has been replicated for each entry point
 * (__memset and __memset16) so that each copy can be scheduled to hide
 * the stalls otherwise seen while the fill pattern is replicated.
 *
 * A future enhancement might be a byte store loop for really small
 * (say < 32 bytes) memset()s.  Whether that would be a win in the
 * kernel depends on the contextual usage.
 *
 * WARNING: Maintaining this is more work than maintaining a single
 * non-replicated body, because any fix has to be made in every copy.
 * The performance gain is worth it.
 *
 * Blank lines between instructions separate groups of four; keep the
 * grouping (and the padding nops) intact when editing.
 */
#include <linux/export.h>
	.set noat
	.set noreorder
	.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

/*
 * ___memset (also reachable as memset and __memset, see end of file)
 *
 * In:	$16 = destination address
 *	$17 = fill value; only the low byte is used
 *	$18 = byte count; zero or negative returns immediately
 * Out:	$0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
 * Returns through $26; no stack frame is created.
 */
	.ent ___memset
	.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens while the fill byte is replicated.
	 * The only mitigation is to interleave the constant
	 * materialization with other parts of the fall-through code,
	 * which is what is done below.  This is important, even though
	 * it makes maintenance tougher.
	 */
	and	$17,255,$1		# E : 00000000000000ch
	insbl	$17,1,$2		# U : 000000000000ch00
	bis	$16,$16,$0		# E : result = original destination
	ble	$18,end_b		# U : nothing to do for count <= 0

	addq	$18,$16,$6		# E : $6 = first address past the area
	bis	$1,$2,$17		# E : 000000000000chch
	insbl	$1,2,$3			# U : 0000000000ch0000
	insbl	$1,3,$4			# U : 00000000ch000000

	bis	$3,$4,$3		# E : 00000000chch0000
	inswl	$17,4,$5		# U : 0000chch00000000
	xor	$16,$6,$1		# E : do start and end share a quadword?
	inswl	$17,6,$2		# U : chch000000000000

	bis	$17,$3,$17		# E : 00000000chchchch
	bis	$2,$5,$2		# E : chchchch00000000
	bic	$1,7,$1			# E : zero when within one quadword
	and	$16,7,$3		# E : misalignment of the destination

	bis	$17,$2,$17		# E : chchchchchchchch
	beq	$1,within_quad_b	# U :
	nop				# E :
	beq	$3,aligned_b		# U : destination is 0mod8

	/*
	 * The destination is misaligned and the area does not fit
	 * within a single quadword: merge the pattern into the first
	 * partial quadword, then continue from the next aligned one.
	 */
	ldq_u	$4,0($16)		# L : fetch the first partial quad
	bis	$16,$16,$5		# E : keep the unaligned address
	insql	$17,$16,$2		# U : pattern bytes to insert
	subq	$3,8,$3			# E : $3 = misalignment - 8 (negative)

	addq	$18,$3,$18		# E : count minus the bytes handled here
	mskql	$4,$16,$4		# U : clear the bytes being replaced
	subq	$16,$3,$16		# E : $16 = next aligned destination
	bis	$2,$4,$1		# E : merged first quad

	nop
	stq_u	$1,0($5)		# L : store the merged quad
	nop
	nop

	.align 4
aligned_b:
	/*
	 * The destination is now quadword aligned, and at least one
	 * (possibly partial) quadword remains to be written.
	 */

	sra	$18,3,$3		# U : whole quads still to write
	and	$18,7,$18		# E : trailing bytes after those quads
	bis	$16,$16,$5		# E : working destination pointer
	beq	$3,no_quad_b		# U : only a tail to write

	/*
	 * It is worth the effort to unroll this and use wh64 if
	 * possible.  A bunch of this code was lifted from clear_user.S.
	 * At this point:
	 *	$16	current destination address
	 *	$5	a copy of $16
	 *	$6	the max quadword address to write to
	 *	$18	number of trailing bytes
	 *	$3	number of quads to write
	 */

	and	$16,0x3f,$2		# E : offset within a 64-byte block
	subq	$3,16,$4		# E : unroll only for at least 16 quads
	subq	$2,0x40,$1		# E : bias counter (aligning to 0mod64)
	blt	$4,loop_b		# U :

	/*
	 * There are at least 16 quads, so at least one trip through
	 * the unrolled loop.  Store one quad at a time until the
	 * destination is 0mod64 aligned.
	 *
	 * NOTE(review): $2 is a multiple of 8 in 0..0x38 here, so $1 is
	 * in -0x40..-8 and the beq below looks never-taken.  Falling
	 * into the alignment loop is also what gives $4 a valid address
	 * for the first wh64 - confirm before changing the branch.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1,$bigalign_b		# U :

$alignmod64_b:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$1,8,$1			# E : step the bias towards zero
	addq	$5,8,$4			# E : initial wh64 address (filler)

	nop
	nop
	addq	$5,8,$5			# E : advance the destination
	blt	$1,$alignmod64_b	# U :

$bigalign_b:
	/*
	 * $3  - number of quads left to go
	 * $5  - target address (aligned 0mod64)
	 * $17 - pattern to store
	 * Scratch registers available: $7, $2, $4, $1
	 * At least one trip through the loop is guaranteed.
	 * CWG Section 3.7.6: do not expect a sustained store rate of
	 * more than one per cycle.
	 * The wh64 is assumed to be wanted two trips ahead: it is
	 * issued for the starting destination address of trip +2, and
	 * when fewer trips than that remain the fallback address
	 * computed into $7 below is used instead.
	 */

$do_wh64_b:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3,24,$2		# E : negative when under 24 quads remain
	stq	$17,0($5)		# L :
	nop				# E :

	addq	$5,128,$4		# E : speculative target of next wh64
	stq	$17,8($5)		# L :
	stq	$17,16($5)		# L :
	addq	$5,64,$7		# E : fallback wh64 target (next trip)

	stq	$17,24($5)		# L :
	stq	$17,32($5)		# L :
	cmovlt	$2,$7,$4		# E : latency 2, extra mapping cycle
	nop

	stq	$17,40($5)		# L :
	stq	$17,48($5)		# L :
	subq	$3,16,$2		# E : at least one more full trip?
	nop

	stq	$17,56($5)		# L :
	addq	$5,64,$5		# E :
	subq	$3,8,$3			# E :
	bge	$2,$do_wh64_b		# U :

	nop
	nop
	nop
	beq	$3,no_quad_b		# U : might have finished already

	.align 4
	/*
	 * Simple loop for the remaining whole quadwords, or for areas
	 * too small for the unrolled loop and wh64.
	 */
loop_b:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$5,8,$5			# E : advance the destination
	bne	$3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes by merging the pattern into the
	 * final quadword.
	 */
	nop				# E :
	beq	$18,end_b		# U : no tail, all done
	ldq	$7,0($5)		# L : fetch the final quad
	mskqh	$7,$6,$2		# U : mask the final quad

	insqh	$17,$6,$4		# U : pattern bytes for the tail
	bis	$2,$4,$1		# E : put it all together
	stq	$1,0($5)		# L : and back to memory
	ret	$31,($26),1		# L0 :

within_quad_b:
	/*
	 * The whole area lies within one quadword: merge the pattern
	 * into it, using $16 for the low mask and $6 for the high one.
	 */
	ldq_u	$1,0($16)		# L : fetch the quad
	insql	$17,$16,$2		# U : new bits
	mskql	$1,$16,$4		# U : clear old
	bis	$2,$4,$2		# E : new result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

/*
 * __constant_c_memset
 *
 * This is the original body of code, prior to replication and
 * rescheduling.  Leave it here, as there may be calls to this
 * entry point.
 *
 * In:	$16 = destination address
 *	$17 = 64-bit pattern, stored as passed; no replication is done
 *	      here (presumably the caller has already replicated the
 *	      fill byte across the quadword - confirm against callers)
 *	$18 = byte count; zero or negative returns immediately
 * Out:	$0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16 and $18 are overwritten.
 */
	.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq	$18,$16,$6		# E : $6 = first address past the area
	bis	$16,$16,$0		# E : result = original destination
	xor	$16,$6,$1		# E : do start and end share a quadword?
	ble	$18,end			# U : nothing to do for count <= 0

	bic	$1,7,$1			# E : zero when within one quadword
	beq	$1,within_one_quad	# U :
	and	$16,7,$3		# E : misalignment of the destination
	beq	$3,aligned		# U : destination is 0mod8

	/*
	 * The destination is misaligned and the area does not fit
	 * within a single quadword: merge the pattern into the first
	 * partial quadword, then continue from the next aligned one.
	 */
	ldq_u	$4,0($16)		# L : fetch the first partial quad
	bis	$16,$16,$5		# E : keep the unaligned address
	insql	$17,$16,$2		# U : pattern bytes to insert
	subq	$3,8,$3			# E : $3 = misalignment - 8 (negative)

	addq	$18,$3,$18		# E : count minus the bytes handled here
	mskql	$4,$16,$4		# U : clear the bytes being replaced
	subq	$16,$3,$16		# E : $16 = next aligned destination
	bis	$2,$4,$1		# E : merged first quad

	nop
	stq_u	$1,0($5)		# L : store the merged quad
	nop
	nop

	.align 4
aligned:
	/*
	 * The destination is now quadword aligned, and at least one
	 * (possibly partial) quadword remains to be written.
	 */

	sra	$18,3,$3		# U : whole quads still to write
	and	$18,7,$18		# E : trailing bytes after those quads
	bis	$16,$16,$5		# E : working destination pointer
	beq	$3,no_quad		# U : only a tail to write

	/*
	 * It is worth the effort to unroll this and use wh64 if
	 * possible.  A bunch of this code was lifted from clear_user.S.
	 * At this point:
	 *	$16	current destination address
	 *	$5	a copy of $16
	 *	$6	the max quadword address to write to
	 *	$18	number of trailing bytes
	 *	$3	number of quads to write
	 */

	and	$16,0x3f,$2		# E : offset within a 64-byte block
	subq	$3,16,$4		# E : unroll only for at least 16 quads
	subq	$2,0x40,$1		# E : bias counter (aligning to 0mod64)
	blt	$4,loop			# U :

	/*
	 * There are at least 16 quads, so at least one trip through
	 * the unrolled loop.  Store one quad at a time until the
	 * destination is 0mod64 aligned.
	 *
	 * NOTE(review): $2 is a multiple of 8 in 0..0x38 here, so $1 is
	 * in -0x40..-8 and the beq below looks never-taken.  Falling
	 * into the alignment loop is also what gives $4 a valid address
	 * for the first wh64 - confirm before changing the branch.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1,$bigalign		# U :

$alignmod64:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$1,8,$1			# E : step the bias towards zero
	addq	$5,8,$4			# E : initial wh64 address (filler)

	nop
	nop
	addq	$5,8,$5			# E : advance the destination
	blt	$1,$alignmod64		# U :

$bigalign:
	/*
	 * $3  - number of quads left to go
	 * $5  - target address (aligned 0mod64)
	 * $17 - pattern to store
	 * Scratch registers available: $7, $2, $4, $1
	 * At least one trip through the loop is guaranteed.
	 * CWG Section 3.7.6: do not expect a sustained store rate of
	 * more than one per cycle.
	 * The wh64 is assumed to be wanted two trips ahead: it is
	 * issued for the starting destination address of trip +2, and
	 * when fewer trips than that remain the fallback address
	 * computed into $7 below is used instead.
	 */

$do_wh64:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3,24,$2		# E : negative when under 24 quads remain
	stq	$17,0($5)		# L :
	nop				# E :

	addq	$5,128,$4		# E : speculative target of next wh64
	stq	$17,8($5)		# L :
	stq	$17,16($5)		# L :
	addq	$5,64,$7		# E : fallback wh64 target (next trip)

	stq	$17,24($5)		# L :
	stq	$17,32($5)		# L :
	cmovlt	$2,$7,$4		# E : latency 2, extra mapping cycle
	nop

	stq	$17,40($5)		# L :
	stq	$17,48($5)		# L :
	subq	$3,16,$2		# E : at least one more full trip?
	nop

	stq	$17,56($5)		# L :
	addq	$5,64,$5		# E :
	subq	$3,8,$3			# E :
	bge	$2,$do_wh64		# U :

	nop
	nop
	nop
	beq	$3,no_quad		# U : might have finished already

	.align 4
	/*
	 * Simple loop for the remaining whole quadwords, or for areas
	 * too small for the unrolled loop and wh64.
	 */
loop:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$5,8,$5			# E : advance the destination
	bne	$3,loop			# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes by merging the pattern into the
	 * final quadword.
	 */
	nop				# E :
	beq	$18,end			# U : no tail, all done
	ldq	$7,0($5)		# L : fetch the final quad
	mskqh	$7,$6,$2		# U : mask the final quad

	insqh	$17,$6,$4		# U : pattern bytes for the tail
	bis	$2,$4,$1		# E : put it all together
	stq	$1,0($5)		# L : and back to memory
	ret	$31,($26),1		# L0 :

within_one_quad:
	/*
	 * The whole area lies within one quadword: merge the pattern
	 * into it, using $16 for the low mask and $6 for the high one.
	 */
	ldq_u	$1,0($16)		# L : fetch the quad
	insql	$17,$16,$2		# U : new bits
	mskql	$1,$16,$4		# U : clear old
	bis	$2,$4,$2		# E : new result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

/*
 * __memset16
 *
 * This is a replicant of the __constant_c_memset code, rescheduled
 * to mask stalls.  Note that the label names also had to change.
 *
 * In:	$16 = destination address
 *	$17 = fill value; the low 16 bits are replicated four times
 *	$18 = count in bytes; zero or negative returns immediately
 * Out:	$0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl	$17,0,$5		# U : 000000000000c1c2
	inswl	$17,2,$2		# U : 00000000c1c20000
	bis	$16,$16,$0		# E : result = original destination
	addq	$18,$16,$6		# E : $6 = first address past the area

	ble	$18,end_w		# U : nothing to do for count <= 0
	inswl	$17,4,$3		# U : 0000c1c200000000
	inswl	$17,6,$4		# U : c1c2000000000000
	xor	$16,$6,$1		# E : do start and end share a quadword?

	bis	$2,$5,$2		# E : 00000000c1c2c1c2
	bis	$3,$4,$17		# E : c1c2c1c200000000
	bic	$1,7,$1			# E : zero when within one quadword
	and	$16,7,$3		# E : misalignment of the destination

	bis	$17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq	$1,within_quad_w	# U :
	nop
	beq	$3,aligned_w		# U : destination is 0mod8

	/*
	 * The destination is misaligned and the area does not fit
	 * within a single quadword: merge the pattern into the first
	 * partial quadword, then continue from the next aligned one.
	 */
	ldq_u	$4,0($16)		# L : fetch the first partial quad
	bis	$16,$16,$5		# E : keep the unaligned address
	insql	$17,$16,$2		# U : pattern bytes to insert
	subq	$3,8,$3			# E : $3 = misalignment - 8 (negative)

	addq	$18,$3,$18		# E : count minus the bytes handled here
	mskql	$4,$16,$4		# U : clear the bytes being replaced
	subq	$16,$3,$16		# E : $16 = next aligned destination
	bis	$2,$4,$1		# E : merged first quad

	nop
	stq_u	$1,0($5)		# L : store the merged quad
	nop
	nop

	.align 4
aligned_w:
	/*
	 * The destination is now quadword aligned, and at least one
	 * (possibly partial) quadword remains to be written.
	 */

	sra	$18,3,$3		# U : whole quads still to write
	and	$18,7,$18		# E : trailing bytes after those quads
	bis	$16,$16,$5		# E : working destination pointer
	beq	$3,no_quad_w		# U : only a tail to write

	/*
	 * It is worth the effort to unroll this and use wh64 if
	 * possible.  A bunch of this code was lifted from clear_user.S.
	 * At this point:
	 *	$16	current destination address
	 *	$5	a copy of $16
	 *	$6	the max quadword address to write to
	 *	$18	number of trailing bytes
	 *	$3	number of quads to write
	 */

	and	$16,0x3f,$2		# E : offset within a 64-byte block
	subq	$3,16,$4		# E : unroll only for at least 16 quads
	subq	$2,0x40,$1		# E : bias counter (aligning to 0mod64)
	blt	$4,loop_w		# U :

	/*
	 * There are at least 16 quads, so at least one trip through
	 * the unrolled loop.  Store one quad at a time until the
	 * destination is 0mod64 aligned.
	 *
	 * NOTE(review): $2 is a multiple of 8 in 0..0x38 here, so $1 is
	 * in -0x40..-8 and the beq below looks never-taken.  Falling
	 * into the alignment loop is also what gives $4 a valid address
	 * for the first wh64 - confirm before changing the branch.
	 */

	nop				# E :
	nop				# E :
	nop				# E :
	beq	$1,$bigalign_w		# U :

$alignmod64_w:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$1,8,$1			# E : step the bias towards zero
	addq	$5,8,$4			# E : initial wh64 address (filler)

	nop
	nop
	addq	$5,8,$5			# E : advance the destination
	blt	$1,$alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3  - number of quads left to go
	 * $5  - target address (aligned 0mod64)
	 * $17 - pattern to store
	 * Scratch registers available: $7, $2, $4, $1
	 * At least one trip through the loop is guaranteed.
	 * CWG Section 3.7.6: do not expect a sustained store rate of
	 * more than one per cycle.
	 * The wh64 is assumed to be wanted two trips ahead: it is
	 * issued for the starting destination address of trip +2, and
	 * when fewer trips than that remain the fallback address
	 * computed into $7 below is used instead.
	 */

$do_wh64_w:
	wh64	($4)			# L1 : memory subsystem write hint
	subq	$3,24,$2		# E : negative when under 24 quads remain
	stq	$17,0($5)		# L :
	nop				# E :

	addq	$5,128,$4		# E : speculative target of next wh64
	stq	$17,8($5)		# L :
	stq	$17,16($5)		# L :
	addq	$5,64,$7		# E : fallback wh64 target (next trip)

	stq	$17,24($5)		# L :
	stq	$17,32($5)		# L :
	cmovlt	$2,$7,$4		# E : latency 2, extra mapping cycle
	nop

	stq	$17,40($5)		# L :
	stq	$17,48($5)		# L :
	subq	$3,16,$2		# E : at least one more full trip?
	nop

	stq	$17,56($5)		# L :
	addq	$5,64,$5		# E :
	subq	$3,8,$3			# E :
	bge	$2,$do_wh64_w		# U :

	nop
	nop
	nop
	beq	$3,no_quad_w		# U : might have finished already

	.align 4
	/*
	 * Simple loop for the remaining whole quadwords, or for areas
	 * too small for the unrolled loop and wh64.
	 */
loop_w:
	stq	$17,0($5)		# L :
	subq	$3,1,$3			# E : one quad fewer to write
	addq	$5,8,$5			# E : advance the destination
	bne	$3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes by merging the pattern into the
	 * final quadword.
	 */
	nop				# E :
	beq	$18,end_w		# U : no tail, all done
	ldq	$7,0($5)		# L : fetch the final quad
	mskqh	$7,$6,$2		# U : mask the final quad

	insqh	$17,$6,$4		# U : pattern bytes for the tail
	bis	$2,$4,$1		# E : put it all together
	stq	$1,0($5)		# L : and back to memory
	ret	$31,($26),1		# L0 :

within_quad_w:
	/*
	 * The whole area lies within one quadword: merge the pattern
	 * into it, using $16 for the low mask and $6 for the high one.
	 */
	ldq_u	$1,0($16)		# L : fetch the quad
	insql	$17,$16,$2		# U : new bits
	mskql	$1,$16,$4		# U : clear old
	bis	$2,$4,$2		# E : new result

	mskql	$2,$6,$4		# U :
	mskqh	$1,$6,$2		# U :
	bis	$2,$4,$1		# E :
	stq_u	$1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret	$31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are aliases for the rescheduled ___memset body. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)