CoCalc -- ev6-divide.S

GitHub Repository: torvalds/linux
Path: blob/master/arch/alpha/lib/ev6-divide.S
⁴⁹⁶³⁹ views
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: modulus minus shifted divisor
 *	$4 - tmp2: quotient plus current bit (division only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>
/* Not referenced by the code visible in this file. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * mask, divisor, compare, tmp1 and tmp2 are fixed register names.
 * modulus and quotient swap between $2 and $27 depending on DIV, so
 * that whichever value is the result is built directly in $27.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * DIV defined:   build __div<x>; the quotient lives in $27.
 * DIV undefined: build __rem<x>; the modulus lives in $27.
 *
 * DIV_ONLY()/MOD_ONLY() emit their argument only in the matching build.
 * The "x,##y" form puts back the comma that splits an instruction's
 * operand list into separate macro arguments.
 *
 * GETSIGN(x) leaves a value in x whose sign is the sign the result must
 * have: the xor of both operands for a quotient, the dividend itself
 * for a remainder.
 *
 * STACK is the frame size in bytes; the division build is larger because
 * it also saves tmp2 at offset 32.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY(x) keeps only the low four bytes of x (zero extension);
 * SLONGIFY(x) sign-extends the low 32 bits of x.  Both expand to
 * nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV defined) or unsigned remainder (DIV
 * undefined); 32-bit operands when INTSIZE is defined, 64-bit otherwise.
 *
 * In:    $24 = dividend, $25 = divisor, $23 = return address
 * Out:   $27 = quotient (DIV) or remainder (no DIV)
 * Saves: $0, $1, $2, tmp1 (and tmp2 when DIV) in a STACK-byte frame and
 *        restores them before returning.  compare ($28) is overwritten.
 *
 * A zero divisor is not trapped: the code branches directly to the
 * restore sequence at 9:, so $27 comes back as 0 in the DIV build and
 * as the (LONGIFY'd) dividend in the remainder build.
 *
 * Label 7: is a second entry point.  sfunction branches to it once it
 * has allocated its own STACK-byte frame and found neither operand
 * negative.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Save the temporaries, copy the operands into divisor/modulus,
	 * clear the quotient and start with mask = 1.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit: shift divisor and mask left one bit at a time while
	 * the pre-shift divisor is below the modulus.  Leave early,
	 * without shifting, once the divisor's top bit is set.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * One pass per mask bit, high to low.  tmp2 = quotient + mask and
	 * tmp1 = modulus - divisor are computed unconditionally and then
	 * committed with cmovne only if divisor <= modulus.  mask and
	 * divisor are shifted right by one each pass; the loop ends when
	 * mask reaches zero.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Restore the saved temporaries, release the frame and return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:    $24 = dividend, $25 = divisor, $23 = return address
 * Out:   $27 = signed quotient (DIV) or signed remainder (no DIV)
 * $24, $25, $23 and tmp1 are saved in the frame and reloaded before
 * returning; $28 is overwritten.
 *
 * Fast path: if the OR of both operands is non-negative, neither is
 * negative, so control moves to label 7 inside ufunction, which reuses
 * the frame allocated here and returns straight to the caller.
 *
 * Slow path: both operands are replaced by their absolute values,
 * ufunction is called through $23, and the result is negated when
 * GETSIGN() on the original operands yields a negative value.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	/* Save the original operands; $28 = -$24 feeds the abs() below. */
	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	/* The bsr below overwrites $23, hence the save at 16($30) above. */
	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/*
	 * Reload the original operands, derive the result sign from them
	 * and pick between $27 and its negation held in tmp1.
	 */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)

265
Product

Resources

Company