/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-clear_user.S
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Zero user space, handling exceptions as we go.
 *
 * We have to make sure that $0 is always up-to-date and contains the
 * right "bytes left to zero" value (and that it is updated only _after_
 * a successful copy).  There is also some rather minor exception setup
 * stuff.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
 * From perusing the source code context where this routine is called, it is
 * a fair assumption that significant fractions of entire pages are zeroed, so
 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
 * ASSUMPTION:
 *	The believed purpose of only updating $0 after a store is that a signal
 *	may come along during the execution of this chunk of code, and we don't
 *	want to leave a hole (and we also want to avoid repeating lots of work)
 */

#include <linux/export.h>

/*
 * Allow an exception for an insn; exit if we get one.
 *
 * Each use emits the instruction at local label 99 and records an
 * __ex_table entry for it whose fixup target is $exception.
 */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($31); 	\
	.previous

	.set noat
	.set noreorder
	.align 4

	.globl __clear_user
	.ent __clear_user
	.frame	$30, 0, $26
	.prologue 0

/*
 * __clear_user
 *
 * In:    $16 = destination address
 *        $17 = number of bytes to zero
 *        $26 = return address
 * Out:   $0  = number of bytes NOT zeroed (0 when everything was cleared)
 * Clobb: $1, $2, $3, $4, $5, $16
 *
 * Every store that may fault is wrapped in EX(); on a fault control is
 * transferred to $exception, which returns with the current $0.
 */
				# Pipeline info : Slotting & Comments
__clear_user:
	and	$17, $17, $0	# $0 = byte count (running "bytes left")
	and	$16, 7, $4	# .. E  .. ..	: find dest head misalignment
	beq	$0, $zerolength # U  .. .. ..	:  U L U L

	addq	$0, $4, $1	# .. .. .. E	: bias counter
	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
# Note - we never actually use $2, so this is a moot computation
# and we can rewrite this later...
	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
	beq	$4, $headalign	# U  .. .. ..	: U L U L

/*
 * Head is not aligned.  Write (8 - $4) bytes to head of destination
 * This means $16 is known to be misaligned
 */
	EX( ldq_u $5, 0($16) )	# .. .. .. L	: load dst word to mask back in
	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
	mskql	$5, $16, $5	# .. U  .. ..	: take care of misaligned head
	addq	$16, 8, $16	# E  .. .. .. 	: L U U L

	EX( stq_u $5, -8($16) )	# .. .. .. L	:
	subq	$1, 1, $1	# .. .. E  ..	:
	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	.align	4
/*
 * (The .align directive ought to be a moot point)
 * values upon initial entry to the loop
 * $1 is number of quadwords to clear (zero is a valid value)
 * $2 is number of trailing bytes (0..7) ($2 never used...)
 * $16 is known to be aligned 0mod8
 */
$headalign:
	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
	and	$16, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
	blt	$4, $trailquad	# U  .. .. ..	: U L U L

/*
 * We know that we're going to do at least 16 quads, which means we are
 * going to be able to use the large block clear loop at least once.
 * Figure out how many quads we need to clear before we are 0mod64 aligned
 * so we can use the wh64 instruction.
 *
 * The "already aligned" test must look at $2 (the misalignment within the
 * 64-byte block).  $3 = $2 - 0x40 lies in -64..-8 and is never zero, so
 * testing $3 here could never take the branch and an aligned destination
 * would clear its first 64 bytes one quad at a time in $alignmod64.
 */

	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$2, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64

/*
 * Clear one quad per trip until $16 is 0mod64.
 * $3 counts up from (misalignment - 64) to zero in steps of 8.
 */
$alignmod64:
	EX( stq_u $31, 0($16) )	# .. .. .. L
	addq	$3, 8, $3	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	subq	$1, 1, $1	# .. .. E  ..
	addq	$16, 8, $16	# .. E  .. ..
	blt	$3, $alignmod64	# U  .. .. ..	: U L U L

$bigalign:
/*
 * $0 is the number of bytes left
 * $1 is the number of quads left
 * $16 is aligned 0mod64
 * we know that we'll be taking a minimum of one trip through
 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 * We are _not_ going to update $0 after every single store.  That
 * would be silly, because there will be cross-cluster dependencies
 * no matter how the code is scheduled.  By doing it in slightly
 * staggered fashion, we can still do this loop in 5 fetches
 * The worst case will be doing two extra quads in some future execution,
 * in the event of an interrupted clear.
 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 * The wh64 is issued for the starting destination address for trip +2
 * through the loop, and if there are less than two trips left, the target
 * address will be for the current trip.
 *
 * NOTE(review): when the cmovlt below selects $16, that value is the base
 * of the trip being executed; $16 is advanced by 64 before the next wh64
 * is issued, so that wh64 names the block that has just been stored.
 * Presumably harmless on the 21264 - confirm against the WH64 definition.
 */
	nop			# E :
	nop			# E :
	nop			# E :
	bis	$16,$16,$3	# E : U L U L : Initial wh64 address is dest
	/* This might actually help for the current trip... */

/*
 * Main loop: 64 bytes (8 quads) per trip.
 * $4 = $1 - 16 is computed up front; the loop repeats while it is >= 0.
 */
$do_wh64:
	wh64	($3)		# .. .. .. L1	: memory subsystem hint
	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
	EX( stq_u $31, 0($16) )	# .. L  .. ..
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	addq	$16, 128, $3	# E : Target address of wh64
	EX( stq_u $31, 8($16) )	# L :
	EX( stq_u $31, 16($16) )	# L :
	subq	$0, 16, $0	# E : U L L U

	nop			# E :
	EX( stq_u $31, 24($16) )	# L :
	EX( stq_u $31, 32($16) )	# L :
	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
	/* 168 = 192 - 24, since we've already completed some stores */

	subq	$0, 16, $0	# E :
	EX( stq_u $31, 40($16) )	# L :
	EX( stq_u $31, 48($16) )	# L :
	cmovlt	$5, $16, $3	# E : U L L U : Latency 2, extra mapping cycle

	subq	$1, 8, $1	# E :
	subq	$0, 16, $0	# E :
	EX( stq_u $31, 56($16) )	# L :
	nop			# E : U L U L

	nop			# E :
	subq	$0, 8, $0	# E :
	addq	$16, 64, $16	# E :
	bge	$4, $do_wh64	# U : U L U L

$trailquad:
	# zero to 16 quadwords left to store, plus any trailing bytes
	# $1 is the number of quadwords left to go.
	#
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go

$onequad:
	EX( stq_u $31, 0($16) )	# .. .. .. L
	subq	$1, 1, $1	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	nop			# .. .. E  ..
	addq	$16, 8, $16	# .. E  .. ..
	bgt	$1, $onequad	# U  .. .. ..	: U L U L

	# We have an unknown number of bytes left to go.
$trailbytes:
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$0, $zerolength	# U  .. .. ..	: U L U L

	# $0 contains the number of bytes left to clear (1..7)
	# so we will use $0 as the loop counter
	# We know for a fact that $0 > 0 due to previous context
$onebyte:
	EX( stb $31, 0($16) )	# .. .. .. L
	subq	$0, 1, $0	# .. .. E  ..	:
	addq	$16, 1, $16	# .. E  .. ..	:
	bgt	$0, $onebyte	# U  .. .. ..	: U L U L

	# Common exit: $0 holds the number of bytes that were not cleared.
$zerolength:
$exception:			# Destination for exception recovery(?)
	nop			# .. .. .. E	:
	nop			# .. .. E  ..	:
	nop			# .. E  .. ..	:
	ret	$31, ($26), 1	# L0 .. .. ..	: L U L U
	.end __clear_user
	EXPORT_SYMBOL(__clear_user)