CoCalc -- ev6-divide.S

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/alpha/lib/ev6-divide.S
¹⁷³⁷² views
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: modulus minus shifted divisor
 *	$4 - tmp2: quotient plus current bit (divide only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

/*
 * Expands to a zero longword.  Not referenced by the code visible in
 * this file.
 */
#define halt .long 0

/*
 * Select function type and registers
 *
 * mask, divisor, tmp1 and tmp2 alias registers that the unsigned routine
 * stores in its stack frame on entry and reloads before returning.
 * compare aliases $28, which is used without being saved.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * DIV defined: build the divide routines (__div*).  DIV_ONLY() passes
 * its arguments through and MOD_ONLY() discards them.  The quotient is
 * accumulated directly in the result register $27 and the running
 * remainder lives in $2.  GETSIGN(x) sets x to dividend ^ divisor, so
 * the sign bit of x is the sign of the quotient.
 *
 * DIV not defined: build the remainder routines (__rem*) with the
 * register roles swapped.  GETSIGN(x) copies the dividend, so the
 * remainder takes the sign of the dividend.
 *
 * STACK is the frame size in bytes.  The divide build stores one more
 * register (tmp2, at offset 32) than the remainder build.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * INTSIZE defined: the routines are named with the "lu"/"l" suffixes.
 * LONGIFY(x) keeps only the low four bytes of x (zero-extension) and
 * SLONGIFY(x) replaces x with its low 32 bits sign-extended.
 *
 * INTSIZE not defined: the routines are named with the "qu"/"q"
 * suffixes and both macros expand to nothing.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV defined) or unsigned remainder
 * (DIV not defined), one quotient bit per iteration.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (divide build) or remainder (remainder build)
 *
 * $0, $1, $2 and $3 (plus $4 in the divide build) are stored in the
 * stack frame on entry and reloaded before returning.  $28 is used as
 * the compare result and is not restored.  $24 and $25 are only read.
 *
 * With INTSIZE defined both operands are reduced to their low 32 bits
 * (zero-extended) before the division starts.
 *
 * A zero divisor branches straight to the exit code at label 9.  At
 * that point the quotient register is zero and the modulus register
 * holds the dividend, so the divide build returns 0 and the remainder
 * build returns the dividend.
 *
 * Label 7 is a second entry point: the signed routine branches to it
 * after allocating an identically sized frame, when neither operand is
 * negative.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Save the temporaries and set up the initial state:
	 * divisor = $25, modulus = $24, quotient = 0, mask = 1.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: double divisor and mask while the divisor is
	 * still below the modulus.  Leave the loop early once the top
	 * bit of the divisor is set, so that the doubling cannot shift
	 * that bit out.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 *
	 * Each pass computes the candidate results first (tmp2 =
	 * quotient + mask in the divide build, tmp1 = modulus - divisor)
	 * and commits them with conditional moves only when
	 * divisor <= modulus.  mask and divisor are then halved, and the
	 * loop ends when mask reaches zero.
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Common exit: reload the saved temporaries, pop the frame, return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = signed quotient (divide build) or remainder (remainder build)
 *
 * Fast path: if dividend | divisor is not negative (after SLONGIFY in
 * the 32-bit build), control branches to label 7 inside the unsigned
 * routine, which finishes the job and returns to the caller using the
 * frame allocated here.
 *
 * Slow path: the original $24, $25, $23 and tmp1 are stored in the
 * frame, $24 and $25 are replaced by their absolute values, and the
 * unsigned routine is called with $23 as the link register (it
 * allocates its own frame).  Afterwards the original operands are
 * reloaded, GETSIGN derives the sign of the result from them, and $27
 * is negated when that sign is negative.  $23 and tmp1 are restored
 * before returning.  $28 is used as scratch and is not restored.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/* Reload the original operands and fix up the sign of the result. */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction

Product

Resources

Company