/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * (The names above are the ones the func()/ufunction/sfunction macros
 * below actually generate.)  This one source file yields one unsigned and
 * one signed routine per build; which pair is selected by whether DIV
 * and/or INTSIZE are defined when it is preprocessed.
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 *
 * The instructions below are laid out in groups of four separated by
 * blank lines; the trailing "# X : ..." annotations use the notation
 * above.  The nop/unop fillers are part of that layout, so keep the
 * instruction order and count intact when editing.
 */

#include <linux/export.h>
/* NOTE(review): "halt" is not referenced anywhere in this file. */
#define halt .long 0

/*
 * Select function type and registers
 *
 *	mask	- single set bit marking the quotient bit being tried
 *	divisor	- working (shifted) copy of $25
 *	compare	- result of the unsigned compares; this is $28
 *	tmp1	- candidate "modulus - divisor"; in sfunction, the negated result
 *	tmp2	- candidate "quotient + mask" (only used when DIV is defined)
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * DIV defined:     build __div*; the quotient lives in $27 (the return
 *                  register) and the running remainder in $2.
 * DIV not defined: build __rem*; the roles are swapped so that the
 *                  remainder ends up in $27.
 *
 * DIV_ONLY(insn) expands to its argument only in the divide build and to
 * nothing otherwise; MOD_ONLY is the mirror image (it is defined here but
 * not used below).
 *
 * GETSIGN(x) leaves in x a value whose sign bit is the sign the signed
 * result must have: sign($24) xor sign($25) for a quotient, sign($24)
 * (the dividend) for a remainder.
 *
 * STACK is the frame size in bytes.  The divide build needs the slot at
 * 32($30) to save tmp2, hence the larger frame.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * INTSIZE defined:     32-bit routines (func(lu) / func(l)).
 *                      LONGIFY zero-extends (zapnot with byte mask 15 keeps
 *                      the low four bytes), SLONGIFY sign-extends (addl).
 * INTSIZE not defined: 64-bit routines (func(qu) / func(q)); both
 *                      macros expand to nothing.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV) or remainder (!DIV).
 *
 *	In:	$24 = dividend, $25 = divisor, $23 = return address
 *	Out:	$27 = quotient (DIV) or remainder (!DIV)
 *	Saved:	$0, $1, $2, $3 (and $4 when DIV) are spilled to the frame
 *		and reloaded before returning; $24/$25 are only read.
 *	Clobbered: $28 (compare).
 *
 * A zero divisor is not trapped: the code branches straight to the
 * epilogue (label 9), so $27 comes back as 0 in the divide build and as
 * the (LONGIFY'd) dividend in the remainder build.
 *
 * Local label 7 is a second entry point: sfunction branches there, after
 * having allocated the same STACK-sized frame, when neither operand is
 * negative.
 *
 * .set noat is needed because $28, the assembler temporary, is used
 * explicitly as "compare".
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/* Spill the working registers and set up divisor/modulus/quotient/mask. */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 *
	 * s8addq x,$31,x computes x*8 + 0.  The loop repeats while the
	 * divisor, as it was before this iteration's shift, is below
	 * the modulus.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: double divisor and mask one bit at a time while
	 * divisor < modulus.  Leave the loop early once the top bit of
	 * divisor is set (blt), before doubling it again could overflow.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * One iteration per mask bit: if divisor <= modulus, subtract it
	 * from modulus and (divide build only) add the current mask bit to
	 * quotient.  Both updates are done with conditional moves from the
	 * precomputed candidates tmp1/tmp2.  mask and divisor are then
	 * shifted right by one; the loop ends when mask reaches zero.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Epilogue (also the divide-by-zero target): reload spills and return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
	EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * sfunction: signed divide (DIV) or remainder (!DIV).
 *
 *	In:	$24 = dividend, $25 = divisor, $23 = return address
 *	Out:	$27 = result
 *	Clobbered: $28
 *
 * Fast path: if the OR of the two operands is non-negative (after
 * SLONGIFY for the 32-bit build), neither is negative, so branch into
 * the unsigned routine at its label 7 and let it return to our caller.
 * The frame allocated here has the same STACK size ufunction expects.
 *
 * Slow path frame layout: 0($30) = $24, 8($30) = $25, 16($30) = $23,
 * 24($30) = tmp1.  Both operands are replaced by their negation when
 * negative, ufunction is called through $23, the original operands are
 * reloaded, and the result is negated when GETSIGN says it must be
 * negative.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/* Back from the unsigned routine: restore operands, then fix the sign. */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
	EXPORT_SYMBOL(sfunction)

