CoCalc -- setox.S

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/m68k/fpsp040/setox.S
¹⁷⁵⁸⁰ views
1
|
2
|	setox.sa 3.1 12/10/90
3
|
4
|	The entry point setox computes the exponential of a value.
5
|	setoxd does the same except the input value is a denormalized
6
|	number.	setoxm1 computes exp(X)-1, and setoxm1d computes
7
|	exp(X)-1 for denormalized X.
8
|
9
|	INPUT
10
|	-----
11
|	Double-extended value in memory location pointed to by address
12
|	register a0.
13
|
14
|	OUTPUT
15
|	------
16
|	exp(X) or exp(X)-1 returned in floating-point register fp0.
17
|
18
|	ACCURACY and MONOTONICITY
19
|	-------------------------
20
|	The returned result is within 0.85 ulps in 64 significant bit, i.e.
21
|	within 0.5001 ulp to 53 bits if the result is subsequently rounded
22
|	to double precision. The result is provably monotonic in double
23
|	precision.
24
|
25
|	SPEED
26
|	-----
27
|	Two timings are measured, both in the copy-back mode. The
28
|	first one is measured when the function is invoked the first time
29
|	(so the instructions and data are not in cache), and the
30
|	second one is measured when the function is reinvoked at the same
31
|	input argument.
32
|
33
|	The program setox takes approximately 210/190 cycles for input
34
|	argument X whose magnitude is less than 16380 log2, which
35
|	is the usual situation.	For the less common arguments,
36
|	depending on their values, the program may run faster or slower --
37
|	but no worse than 10% slower even in the extreme cases.
38
|
39
|	The program setoxm1 takes approximately ??? / ??? cycles for input
40
|	argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
41
|	approximately ??? / ??? cycles. For the less common arguments,
42
|	depending on their values, the program may run faster or slower --
43
|	but no worse than 10% slower even in the extreme cases.
44
|
45
|	ALGORITHM and IMPLEMENTATION NOTES
46
|	----------------------------------
47
|
48
|	setoxd
49
|	------
50
|	Step 1.	Set ans := 1.0
51
|
52
|	Step 2.	Return	ans := ans + sign(X)*2^(-126). Exit.
53
|	Notes:	This will always generate one exception -- inexact.
54
|
55
|
56
|	setox
57
|	-----
58
|
59
|	Step 1.	Filter out extreme cases of input argument.
60
|		1.1	If |X| >= 2^(-65), go to Step 1.3.
61
|		1.2	Go to Step 7.
62
|		1.3	If |X| < 16380 log(2), go to Step 2.
63
|		1.4	Go to Step 8.
64
|	Notes:	The usual case should take the branches 1.1 -> 1.3 -> 2.
65
|		 To avoid the use of floating-point comparisons, a
66
|		 compact representation of |X| is used. This format is a
67
|		 32-bit integer, the upper (more significant) 16 bits are
68
|		 the sign and biased exponent field of |X|; the lower 16
69
|		 bits are the 16 most significant fraction (including the
70
|		 explicit bit) bits of |X|. Consequently, the comparisons
71
|		 in Steps 1.1 and 1.3 can be performed by integer comparison.
72
|		 Note also that the constant 16380 log(2) used in Step 1.3
73
|		 is also in the compact form. Thus taking the branch
74
|		 to Step 2 guarantees |X| < 16380 log(2). There is no harm
75
|		 to have a small number of cases where |X| is less than,
76
|		 but close to, 16380 log(2) and the branch to Step 9 is
77
|		 taken.
78
|
79
|	Step 2.	Calculate N = round-to-nearest-int( X * 64/log2 ).
80
|		2.1	Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
81
|		2.2	N := round-to-nearest-integer( X * 64/log2 ).
82
|		2.3	Calculate	J = N mod 64; so J = 0,1,2,..., or 63.
83
|		2.4	Calculate	M = (N - J)/64; so N = 64M + J.
84
|		2.5	Calculate the address of the stored value of 2^(J/64).
85
|		2.6	Create the value Scale = 2^M.
86
|	Notes:	The calculation in 2.2 is really performed by
87
|
88
|			Z := X * constant
89
|			N := round-to-nearest-integer(Z)
90
|
91
|		 where
92
|
93
|			constant := single-precision( 64/log 2 ).
94
|
95
|		 Using a single-precision constant avoids memory access.
96
|		 Another effect of using a single-precision "constant" is
97
|		 that the calculated value Z is
98
|
99
|			Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
100
|
101
|		 This error has to be considered later in Steps 3 and 4.
102
|
103
|	Step 3.	Calculate X - N*log2/64.
104
|		3.1	R := X + N*L1, where L1 := single-precision(-log2/64).
105
|		3.2	R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
106
|	Notes:	a) The way L1 and L2 are chosen ensures L1+L2 approximate
107
|		 the value	-log2/64	to 88 bits of accuracy.
108
|		 b) N*L1 is exact because N is no longer than 22 bits and
109
|		 L1 is no longer than 24 bits.
110
|		 c) The calculation X+N*L1 is also exact due to cancellation.
111
|		 Thus, R is practically X+N(L1+L2) to full 64 bits.
112
|		 d) It is important to estimate how large can |R| be after
113
|		 Step 3.2.
114
|
115
|			N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
116
|			X*64/log2 (1+eps)	=	N + f,	|f| <= 0.5
117
|			X*64/log2 - N	=	f - eps*X 64/log2
118
|			X - N*log2/64	=	f*log2/64 - eps*X
119
|
120
|
121
|		 Now |X| <= 16446 log2, thus
122
|
123
|			|X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
124
|					<= 0.57 log2/64.
125
|		 This bound will be used in Step 4.
126
|
127
|	Step 4.	Approximate exp(R)-1 by a polynomial
128
|			p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
129
|	Notes:	a) In order to reduce memory access, the coefficients are
130
|		 made as "short" as possible: A1 (which is 1/2), A4 and A5
131
|		 are single precision; A2 and A3 are double precision.
132
|		 b) Even with the restrictions above,
133
|			|p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
134
|		 Note that 0.0062 is slightly bigger than 0.57 log2/64.
135
|		 c) To fully utilize the pipeline, p is separated into
136
|		 two independent pieces of roughly equal complexities
137
|			p = [ R + R*S*(A2 + S*A4) ]	+
138
|				[ S*(A1 + S*(A3 + S*A5)) ]
139
|		 where S = R*R.
140
|
141
|	Step 5.	Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
142
|				ans := T + ( T*p + t)
143
|		 where T and t are the stored values for 2^(J/64).
144
|	Notes:	2^(J/64) is stored as T and t where T+t approximates
145
|		 2^(J/64) to roughly 85 bits; T is in extended precision
146
|		 and t is in single precision. Note also that T is rounded
147
|		 to 62 bits so that the last two bits of T are zero. The
148
|		 reason for such a special form is that T-1, T-2, and T-8
149
|		 will all be exact --- a property that will give much
150
|		 more accurate computation of the function EXPM1.
151
|
152
|	Step 6.	Reconstruction of exp(X)
153
|			exp(X) = 2^M * 2^(J/64) * exp(R).
154
|		6.1	If AdjFlag = 0, go to 6.3
155
|		6.2	ans := ans * AdjScale
156
|		6.3	Restore the user FPCR
157
|		6.4	Return ans := ans * Scale. Exit.
158
|	Notes:	If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
159
|		 |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
160
|		 neither overflow nor underflow. If AdjFlag = 1, that
161
|		 means that
162
|			X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
163
|		 Hence, exp(X) may overflow or underflow or neither.
164
|		 When that is the case, AdjScale = 2^(M1) where M1 is
165
|		 approximately M. Thus 6.2 will never cause over/underflow.
166
|		 Possible exception in 6.4 is overflow or underflow.
167
|		 The inexact exception is not generated in 6.4. Although
168
|		 one can argue that the inexact flag should always be
169
|		 raised, to simulate that exception cost to much than the
170
|		 flag is worth in practical uses.
171
|
172
|	Step 7.	Return 1 + X.
173
|		7.1	ans := X
174
|		7.2	Restore user FPCR.
175
|		7.3	Return ans := 1 + ans. Exit
176
|	Notes:	For non-zero X, the inexact exception will always be
177
|		 raised by 7.3. That is the only exception raised by 7.3.
178
|		 Note also that we use the FMOVEM instruction to move X
179
|		 in Step 7.1 to avoid unnecessary trapping. (Although
180
|		 the FMOVEM may not seem relevant since X is normalized,
181
|		 the precaution will be useful in the library version of
182
|		 this code where the separate entry for denormalized inputs
183
|		 will be done away with.)
184
|
185
|	Step 8.	Handle exp(X) where |X| >= 16380log2.
186
|		8.1	If |X| > 16480 log2, go to Step 9.
187
|		(mimic 2.2 - 2.6)
188
|		8.2	N := round-to-integer( X * 64/log2 )
189
|		8.3	Calculate J = N mod 64, J = 0,1,...,63
190
|		8.4	K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
191
|		8.5	Calculate the address of the stored value 2^(J/64).
192
|		8.6	Create the values Scale = 2^M, AdjScale = 2^M1.
193
|		8.7	Go to Step 3.
194
|	Notes:	Refer to notes for 2.2 - 2.6.
195
|
196
|	Step 9.	Handle exp(X), |X| > 16480 log2.
197
|		9.1	If X < 0, go to 9.3
198
|		9.2	ans := Huge, go to 9.4
199
|		9.3	ans := Tiny.
200
|		9.4	Restore user FPCR.
201
|		9.5	Return ans := ans * ans. Exit.
202
|	Notes:	Exp(X) will surely overflow or underflow, depending on
203
|		 X's sign. "Huge" and "Tiny" are respectively large/tiny
204
|		 extended-precision numbers whose square over/underflow
205
|		 with an inexact result. Thus, 9.5 always raises the
206
|		 inexact together with either overflow or underflow.
207
|
208
|
209
|	setoxm1d
210
|	--------
211
|
212
|	Step 1.	Set ans := 0
213
|
214
|	Step 2.	Return	ans := X + ans. Exit.
215
|	Notes:	This will return X with the appropriate rounding
216
|		 precision prescribed by the user FPCR.
217
|
218
|	setoxm1
219
|	-------
220
|
221
|	Step 1.	Check |X|
222
|		1.1	If |X| >= 1/4, go to Step 1.3.
223
|		1.2	Go to Step 7.
224
|		1.3	If |X| < 70 log(2), go to Step 2.
225
|		1.4	Go to Step 10.
226
|	Notes:	The usual case should take the branches 1.1 -> 1.3 -> 2.
227
|		 However, it is conceivable |X| can be small very often
228
|		 because EXPM1 is intended to evaluate exp(X)-1 accurately
229
|		 when |X| is small. For further details on the comparisons,
230
|		 see the notes on Step 1 of setox.
231
|
232
|	Step 2.	Calculate N = round-to-nearest-int( X * 64/log2 ).
233
|		2.1	N := round-to-nearest-integer( X * 64/log2 ).
234
|		2.2	Calculate	J = N mod 64; so J = 0,1,2,..., or 63.
235
|		2.3	Calculate	M = (N - J)/64; so N = 64M + J.
236
|		2.4	Calculate the address of the stored value of 2^(J/64).
237
|		2.5	Create the values Sc = 2^M and OnebySc := -2^(-M).
238
|	Notes:	See the notes on Step 2 of setox.
239
|
240
|	Step 3.	Calculate X - N*log2/64.
241
|		3.1	R := X + N*L1, where L1 := single-precision(-log2/64).
242
|		3.2	R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
243
|	Notes:	Applying the analysis of Step 3 of setox in this case
244
|		 shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
245
|		 this case).
246
|
247
|	Step 4.	Approximate exp(R)-1 by a polynomial
248
|			p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
249
|	Notes:	a) In order to reduce memory access, the coefficients are
250
|		 made as "short" as possible: A1 (which is 1/2), A5 and A6
251
|		 are single precision; A2, A3 and A4 are double precision.
252
|		 b) Even with the restriction above,
253
|			|p - (exp(R)-1)| <	|R| * 2^(-72.7)
254
|		 for all |R| <= 0.0055.
255
|		 c) To fully utilize the pipeline, p is separated into
256
|		 two independent pieces of roughly equal complexity
257
|			p = [ R*S*(A2 + S*(A4 + S*A6)) ]	+
258
|				[ R + S*(A1 + S*(A3 + S*A5)) ]
259
|		 where S = R*R.
260
|
261
|	Step 5.	Compute 2^(J/64)*p by
262
|				p := T*p
263
|		 where T and t are the stored values for 2^(J/64).
264
|	Notes:	2^(J/64) is stored as T and t where T+t approximates
265
|		 2^(J/64) to roughly 85 bits; T is in extended precision
266
|		 and t is in single precision. Note also that T is rounded
267
|		 to 62 bits so that the last two bits of T are zero. The
268
|		 reason for such a special form is that T-1, T-2, and T-8
269
|		 will all be exact --- a property that will be exploited
270
|		 in Step 6 below. The total relative error in p is no
271
|		 bigger than 2^(-67.7) compared to the final result.
272
|
273
|	Step 6.	Reconstruction of exp(X)-1
274
|			exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
275
|		6.1	If M <= 63, go to Step 6.3.
276
|		6.2	ans := T + (p + (t + OnebySc)). Go to 6.6
277
|		6.3	If M >= -3, go to 6.5.
278
|		6.4	ans := (T + (p + t)) + OnebySc. Go to 6.6
279
|		6.5	ans := (T + OnebySc) + (p + t).
280
|		6.6	Restore user FPCR.
281
|		6.7	Return ans := Sc * ans. Exit.
282
|	Notes:	The various arrangements of the expressions give accurate
283
|		 evaluations.
284
|
285
|	Step 7.	exp(X)-1 for |X| < 1/4.
286
|		7.1	If |X| >= 2^(-65), go to Step 9.
287
|		7.2	Go to Step 8.
288
|
289
|	Step 8.	Calculate exp(X)-1, |X| < 2^(-65).
290
|		8.1	If |X| < 2^(-16312), goto 8.3
291
|		8.2	Restore FPCR; return ans := X - 2^(-16382). Exit.
292
|		8.3	X := X * 2^(140).
293
|		8.4	Restore FPCR; ans := ans - 2^(-16382).
294
|		 Return ans := ans*2^(140). Exit
295
|	Notes:	The idea is to return "X - tiny" under the user
296
|		 precision and rounding modes. To avoid unnecessary
297
|		 inefficiency, we stay away from denormalized numbers the
298
|		 best we can. For |X| >= 2^(-16312), the straightforward
299
|		 8.2 generates the inexact exception as the case warrants.
300
|
301
|	Step 9.	Calculate exp(X)-1, |X| < 1/4, by a polynomial
302
|			p = X + X*X*(B1 + X*(B2 + ... + X*B12))
303
|	Notes:	a) In order to reduce memory access, the coefficients are
304
|		 made as "short" as possible: B1 (which is 1/2), B9 to B12
305
|		 are single precision; B3 to B8 are double precision; and
306
|		 B2 is double extended.
307
|		 b) Even with the restriction above,
308
|			|p - (exp(X)-1)| < |X| 2^(-70.6)
309
|		 for all |X| <= 0.251.
310
|		 Note that 0.251 is slightly bigger than 1/4.
311
|		 c) To fully preserve accuracy, the polynomial is computed
312
|		 as	X + ( S*B1 +	Q ) where S = X*X and
313
|			Q	=	X*S*(B2 + X*(B3 + ... + X*B12))
314
|		 d) To fully utilize the pipeline, Q is separated into
315
|		 two independent pieces of roughly equal complexity
316
|			Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
317
|				[ S*S*(B3 + S*(B5 + ... + S*B11)) ]
318
|
319
|	Step 10.	Calculate exp(X)-1 for |X| >= 70 log 2.
320
|		10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
321
|		 purposes. Therefore, go to Step 1 of setox.
322
|		10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
323
|		 ans := -1
324
|		 Restore user FPCR
325
|		 Return ans := ans + 2^(-126). Exit.
326
|	Notes:	10.2 will always create an inexact and return -1 + tiny
327
|		 in the user rounding precision and mode.
328
|
329
|
330

331
|		Copyright (C) Motorola, Inc. 1990
332
|			All Rights Reserved
333
|
334
|       For details on the license for this file, please see the
335
|       file, README, in this same directory.
336

337
|setox	idnt	2,1 | Motorola 040 Floating Point Software Package
338

339
	|section	8
340

341
#include "fpsp.h"
342

343
L2:	.long	0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
344

345
EXPA3:	.long	0x3FA55555,0x55554431
346
EXPA2:	.long	0x3FC55555,0x55554018
347

348
HUGE:	.long	0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
349
TINY:	.long	0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
350

351
EM1A4:	.long	0x3F811111,0x11174385
352
EM1A3:	.long	0x3FA55555,0x55554F5A
353

354
EM1A2:	.long	0x3FC55555,0x55555555,0x00000000,0x00000000
355

356
EM1B8:	.long	0x3EC71DE3,0xA5774682
357
EM1B7:	.long	0x3EFA01A0,0x19D7CB68
358

359
EM1B6:	.long	0x3F2A01A0,0x1A019DF3
360
EM1B5:	.long	0x3F56C16C,0x16C170E2
361

362
EM1B4:	.long	0x3F811111,0x11111111
363
EM1B3:	.long	0x3FA55555,0x55555555
364

365
EM1B2:	.long	0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
366
	.long	0x00000000
367

368
TWO140:	.long	0x48B00000,0x00000000
369
TWON140:	.long	0x37300000,0x00000000
370

371
EXPTBL:
372
	.long	0x3FFF0000,0x80000000,0x00000000,0x00000000
373
	.long	0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
374
	.long	0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
375
	.long	0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
376
	.long	0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
377
	.long	0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
378
	.long	0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
379
	.long	0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
380
	.long	0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
381
	.long	0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
382
	.long	0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
383
	.long	0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
384
	.long	0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
385
	.long	0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
386
	.long	0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
387
	.long	0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
388
	.long	0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
389
	.long	0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
390
	.long	0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
391
	.long	0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
392
	.long	0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
393
	.long	0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
394
	.long	0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
395
	.long	0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
396
	.long	0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
397
	.long	0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
398
	.long	0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
399
	.long	0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
400
	.long	0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
401
	.long	0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
402
	.long	0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
403
	.long	0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
404
	.long	0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
405
	.long	0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
406
	.long	0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
407
	.long	0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
408
	.long	0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
409
	.long	0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
410
	.long	0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
411
	.long	0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
412
	.long	0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
413
	.long	0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
414
	.long	0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
415
	.long	0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
416
	.long	0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
417
	.long	0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
418
	.long	0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
419
	.long	0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
420
	.long	0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
421
	.long	0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
422
	.long	0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
423
	.long	0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
424
	.long	0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
425
	.long	0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
426
	.long	0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
427
	.long	0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
428
	.long	0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
429
	.long	0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
430
	.long	0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
431
	.long	0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
432
	.long	0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
433
	.long	0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
434
	.long	0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
435
	.long	0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
436

437
	.set	ADJFLAG,L_SCR2
438
	.set	SCALE,FP_SCR1
439
	.set	ADJSCALE,FP_SCR2
440
	.set	SC,FP_SCR3
441
	.set	ONEBYSC,FP_SCR4
442

443
	| xref	t_frcinx
444
	|xref	t_extdnrm
445
	|xref	t_unfl
446
	|xref	t_ovfl
447

448
	.global	setoxd
449
setoxd:
450
|--entry point for EXP(X), X is denormalized
451
	movel		(%a0),%d0
452
	andil		#0x80000000,%d0
453
	oril		#0x00800000,%d0		| ...sign(X)*2^(-126)
454
	movel		%d0,-(%sp)
455
	fmoves		#0x3F800000,%fp0
456
	fmovel		%d1,%fpcr
457
	fadds		(%sp)+,%fp0
458
	bra		t_frcinx
459

460
	.global	setox
461
setox:
462
|--entry point for EXP(X), here X is finite, non-zero, and not NaN's
463

464
|--Step 1.
465
	movel		(%a0),%d0	 | ...load part of input X
466
	andil		#0x7FFF0000,%d0	| ...biased expo. of X
467
	cmpil		#0x3FBE0000,%d0	| ...2^(-65)
468
	bges		EXPC1		| ...normal case
469
	bra		EXPSM
470

471
EXPC1:
472
|--The case |X| >= 2^(-65)
473
	movew		4(%a0),%d0	| ...expo. and partial sig. of |X|
474
	cmpil		#0x400CB167,%d0	| ...16380 log2 trunc. 16 bits
475
	blts		EXPMAIN	 | ...normal case
476
	bra		EXPBIG
477

478
EXPMAIN:
479
|--Step 2.
480
|--This is the normal branch:	2^(-65) <= |X| < 16380 log2.
481
	fmovex		(%a0),%fp0	| ...load input from (a0)
482

483
	fmovex		%fp0,%fp1
484
	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
485
	fmovemx	%fp2-%fp2/%fp3,-(%a7)		| ...save fp2
486
	movel		#0,ADJFLAG(%a6)
487
	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
488
	lea		EXPTBL,%a1
489
	fmovel		%d0,%fp0		| ...convert to floating-format
490

491
	movel		%d0,L_SCR1(%a6)	| ...save N temporarily
492
	andil		#0x3F,%d0		| ...D0 is J = N mod 64
493
	lsll		#4,%d0
494
	addal		%d0,%a1		| ...address of 2^(J/64)
495
	movel		L_SCR1(%a6),%d0
496
	asrl		#6,%d0		| ...D0 is M
497
	addiw		#0x3FFF,%d0	| ...biased expo. of 2^(M)
498
	movew		L2,L_SCR1(%a6)	| ...prefetch L2, no need in CB
499

500
EXPCONT1:
501
|--Step 3.
502
|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
503
|--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
504
	fmovex		%fp0,%fp2
505
	fmuls		#0xBC317218,%fp0	| ...N * L1, L1 = lead(-log2/64)
506
	fmulx		L2,%fp2		| ...N * L2, L1+L2 = -log2/64
507
	faddx		%fp1,%fp0		| ...X + N*L1
508
	faddx		%fp2,%fp0		| ...fp0 is R, reduced arg.
509
|	MOVE.W		#$3FA5,EXPA3	...load EXPA3 in cache
510

511
|--Step 4.
512
|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
513
|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
514
|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
515
|--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
516

517
	fmovex		%fp0,%fp1
518
	fmulx		%fp1,%fp1		| ...fp1 IS S = R*R
519

520
	fmoves		#0x3AB60B70,%fp2	| ...fp2 IS A5
521
|	MOVE.W		#0,2(%a1)	...load 2^(J/64) in cache
522

523
	fmulx		%fp1,%fp2		| ...fp2 IS S*A5
524
	fmovex		%fp1,%fp3
525
	fmuls		#0x3C088895,%fp3	| ...fp3 IS S*A4
526

527
	faddd		EXPA3,%fp2	| ...fp2 IS A3+S*A5
528
	faddd		EXPA2,%fp3	| ...fp3 IS A2+S*A4
529

530
	fmulx		%fp1,%fp2		| ...fp2 IS S*(A3+S*A5)
531
	movew		%d0,SCALE(%a6)	| ...SCALE is 2^(M) in extended
532
	clrw		SCALE+2(%a6)
533
	movel		#0x80000000,SCALE+4(%a6)
534
	clrl		SCALE+8(%a6)
535

536
	fmulx		%fp1,%fp3		| ...fp3 IS S*(A2+S*A4)
537

538
	fadds		#0x3F000000,%fp2	| ...fp2 IS A1+S*(A3+S*A5)
539
	fmulx		%fp0,%fp3		| ...fp3 IS R*S*(A2+S*A4)
540

541
	fmulx		%fp1,%fp2		| ...fp2 IS S*(A1+S*(A3+S*A5))
542
	faddx		%fp3,%fp0		| ...fp0 IS R+R*S*(A2+S*A4),
543
|					...fp3 released
544

545
	fmovex		(%a1)+,%fp1	| ...fp1 is lead. pt. of 2^(J/64)
546
	faddx		%fp2,%fp0		| ...fp0 is EXP(R) - 1
547
|					...fp2 released
548

549
|--Step 5
550
|--final reconstruction process
551
|--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
552

553
	fmulx		%fp1,%fp0		| ...2^(J/64)*(Exp(R)-1)
554
	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
555
	fadds		(%a1),%fp0	| ...accurate 2^(J/64)
556

557
	faddx		%fp1,%fp0		| ...2^(J/64) + 2^(J/64)*...
558
	movel		ADJFLAG(%a6),%d0
559

560
|--Step 6
561
	tstl		%d0
562
	beqs		NORMAL
563
ADJUST:
564
	fmulx		ADJSCALE(%a6),%fp0
565
NORMAL:
566
	fmovel		%d1,%FPCR		| ...restore user FPCR
567
	fmulx		SCALE(%a6),%fp0	| ...multiply 2^(M)
568
	bra		t_frcinx
569

570
EXPSM:
571
|--Step 7
572
	fmovemx	(%a0),%fp0-%fp0	| ...in case X is denormalized
573
	fmovel		%d1,%FPCR
574
	fadds		#0x3F800000,%fp0	| ...1+X in user mode
575
	bra		t_frcinx
576

577
EXPBIG:
578
|--Step 8
579
	cmpil		#0x400CB27C,%d0	| ...16480 log2
580
	bgts		EXP2BIG
581
|--Steps 8.2 -- 8.6
582
	fmovex		(%a0),%fp0	| ...load input from (a0)
583

584
	fmovex		%fp0,%fp1
585
	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
586
	fmovemx	 %fp2-%fp2/%fp3,-(%a7)		| ...save fp2
587
	movel		#1,ADJFLAG(%a6)
588
	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
589
	lea		EXPTBL,%a1
590
	fmovel		%d0,%fp0		| ...convert to floating-format
591
	movel		%d0,L_SCR1(%a6)			| ...save N temporarily
592
	andil		#0x3F,%d0		 | ...D0 is J = N mod 64
593
	lsll		#4,%d0
594
	addal		%d0,%a1			| ...address of 2^(J/64)
595
	movel		L_SCR1(%a6),%d0
596
	asrl		#6,%d0			| ...D0 is K
597
	movel		%d0,L_SCR1(%a6)			| ...save K temporarily
598
	asrl		#1,%d0			| ...D0 is M1
599
	subl		%d0,L_SCR1(%a6)			| ...a1 is M
600
	addiw		#0x3FFF,%d0		| ...biased expo. of 2^(M1)
601
	movew		%d0,ADJSCALE(%a6)		| ...ADJSCALE := 2^(M1)
602
	clrw		ADJSCALE+2(%a6)
603
	movel		#0x80000000,ADJSCALE+4(%a6)
604
	clrl		ADJSCALE+8(%a6)
605
	movel		L_SCR1(%a6),%d0			| ...D0 is M
606
	addiw		#0x3FFF,%d0		| ...biased expo. of 2^(M)
607
	bra		EXPCONT1		| ...go back to Step 3
608

609
EXP2BIG:
610
|--Step 9
611
	fmovel		%d1,%FPCR
612
	movel		(%a0),%d0
613
	bclrb		#sign_bit,(%a0)		| ...setox always returns positive
614
	cmpil		#0,%d0
615
	blt		t_unfl
616
	bra		t_ovfl
617

618
	.global	setoxm1d
619
setoxm1d:
620
|--entry point for EXPM1(X), here X is denormalized
621
|--Step 0.
622
	bra		t_extdnrm
623

624

625
	.global	setoxm1
626
setoxm1:
627
|--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
628

629
|--Step 1.
630
|--Step 1.1
631
	movel		(%a0),%d0	 | ...load part of input X
632
	andil		#0x7FFF0000,%d0	| ...biased expo. of X
633
	cmpil		#0x3FFD0000,%d0	| ...1/4
634
	bges		EM1CON1	 | ...|X| >= 1/4
635
	bra		EM1SM
636

637
EM1CON1:
638
|--Step 1.3
639
|--The case |X| >= 1/4
640
	movew		4(%a0),%d0	| ...expo. and partial sig. of |X|
641
	cmpil		#0x4004C215,%d0	| ...70log2 rounded up to 16 bits
642
	bles		EM1MAIN	 | ...1/4 <= |X| <= 70log2
643
	bra		EM1BIG
644

645
EM1MAIN:
646
|--Step 2.
647
|--This is the case:	1/4 <= |X| <= 70 log2.
648
	fmovex		(%a0),%fp0	| ...load input from (a0)
649

650
	fmovex		%fp0,%fp1
651
	fmuls		#0x42B8AA3B,%fp0	| ...64/log2 * X
652
	fmovemx	%fp2-%fp2/%fp3,-(%a7)		| ...save fp2
653
|	MOVE.W		#$3F81,EM1A4		...prefetch in CB mode
654
	fmovel		%fp0,%d0		| ...N = int( X * 64/log2 )
655
	lea		EXPTBL,%a1
656
	fmovel		%d0,%fp0		| ...convert to floating-format
657

658
	movel		%d0,L_SCR1(%a6)			| ...save N temporarily
659
	andil		#0x3F,%d0		 | ...D0 is J = N mod 64
660
	lsll		#4,%d0
661
	addal		%d0,%a1			| ...address of 2^(J/64)
662
	movel		L_SCR1(%a6),%d0
663
	asrl		#6,%d0			| ...D0 is M
664
	movel		%d0,L_SCR1(%a6)			| ...save a copy of M
665
|	MOVE.W		#$3FDC,L2		...prefetch L2 in CB mode
666

667
|--Step 3.
668
|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
669
|--a0 points to 2^(J/64), D0 and a1 both contain M
670
	fmovex		%fp0,%fp2
671
	fmuls		#0xBC317218,%fp0	| ...N * L1, L1 = lead(-log2/64)
672
	fmulx		L2,%fp2		| ...N * L2, L1+L2 = -log2/64
673
	faddx		%fp1,%fp0	 | ...X + N*L1
674
	faddx		%fp2,%fp0	 | ...fp0 is R, reduced arg.
675
|	MOVE.W		#$3FC5,EM1A2		...load EM1A2 in cache
676
	addiw		#0x3FFF,%d0		| ...D0 is biased expo. of 2^M
677

678
|--Step 4.
679
|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
680
|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
681
|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
682
|--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
683

684
	fmovex		%fp0,%fp1
685
	fmulx		%fp1,%fp1		| ...fp1 IS S = R*R
686

687
	fmoves		#0x3950097B,%fp2	| ...fp2 IS a6
688
|	MOVE.W		#0,2(%a1)	...load 2^(J/64) in cache
689

690
	fmulx		%fp1,%fp2		| ...fp2 IS S*A6
691
	fmovex		%fp1,%fp3
692
	fmuls		#0x3AB60B6A,%fp3	| ...fp3 IS S*A5
693

694
	faddd		EM1A4,%fp2	| ...fp2 IS A4+S*A6
695
	faddd		EM1A3,%fp3	| ...fp3 IS A3+S*A5
696
	movew		%d0,SC(%a6)		| ...SC is 2^(M) in extended
697
	clrw		SC+2(%a6)
698
	movel		#0x80000000,SC+4(%a6)
699
	clrl		SC+8(%a6)
700

701
	fmulx		%fp1,%fp2		| ...fp2 IS S*(A4+S*A6)
702
	movel		L_SCR1(%a6),%d0		| ...D0 is	M
703
	negw		%d0		| ...D0 is -M
704
	fmulx		%fp1,%fp3		| ...fp3 IS S*(A3+S*A5)
705
	addiw		#0x3FFF,%d0	| ...biased expo. of 2^(-M)
706
	faddd		EM1A2,%fp2	| ...fp2 IS A2+S*(A4+S*A6)
707
	fadds		#0x3F000000,%fp3	| ...fp3 IS A1+S*(A3+S*A5)
708

709
	fmulx		%fp1,%fp2		| ...fp2 IS S*(A2+S*(A4+S*A6))
710
	oriw		#0x8000,%d0	| ...signed/expo. of -2^(-M)
711
	movew		%d0,ONEBYSC(%a6)	| ...OnebySc is -2^(-M)
712
	clrw		ONEBYSC+2(%a6)
713
	movel		#0x80000000,ONEBYSC+4(%a6)
714
	clrl		ONEBYSC+8(%a6)
715
	fmulx		%fp3,%fp1		| ...fp1 IS S*(A1+S*(A3+S*A5))
716
|					...fp3 released
717

718
	fmulx		%fp0,%fp2		| ...fp2 IS R*S*(A2+S*(A4+S*A6))
719
	faddx		%fp1,%fp0		| ...fp0 IS R+S*(A1+S*(A3+S*A5))
720
|					...fp1 released
721

722
	faddx		%fp2,%fp0		| ...fp0 IS EXP(R)-1
723
|					...fp2 released
724
	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
725

726
|--Step 5
727
|--Compute 2^(J/64)*p
728

729
	fmulx		(%a1),%fp0	| ...2^(J/64)*(Exp(R)-1)
730

731
|--Step 6
732
|--Step 6.1
733
	movel		L_SCR1(%a6),%d0		| ...retrieve M
734
	cmpil		#63,%d0
735
	bles		MLE63
736
|--Step 6.2	M >= 64
737
	fmoves		12(%a1),%fp1	| ...fp1 is t
738
	faddx		ONEBYSC(%a6),%fp1	| ...fp1 is t+OnebySc
739
	faddx		%fp1,%fp0		| ...p+(t+OnebySc), fp1 released
740
	faddx		(%a1),%fp0	| ...T+(p+(t+OnebySc))
741
	bras		EM1SCALE
742
MLE63:
743
|--Step 6.3	M <= 63
744
	cmpil		#-3,%d0
745
	bges		MGEN3
746
MLTN3:
747
|--Step 6.4	M <= -4
748
	fadds		12(%a1),%fp0	| ...p+t
749
	faddx		(%a1),%fp0	| ...T+(p+t)
750
	faddx		ONEBYSC(%a6),%fp0	| ...OnebySc + (T+(p+t))
751
	bras		EM1SCALE
752
MGEN3:
753
|--Step 6.5	-3 <= M <= 63
754
	fmovex		(%a1)+,%fp1	| ...fp1 is T
755
	fadds		(%a1),%fp0	| ...fp0 is p+t
756
	faddx		ONEBYSC(%a6),%fp1	| ...fp1 is T+OnebySc
757
	faddx		%fp1,%fp0		| ...(T+OnebySc)+(p+t)
758

759
EM1SCALE:
760
|--Step 6.6
761
	fmovel		%d1,%FPCR
762
	fmulx		SC(%a6),%fp0
763

764
	bra		t_frcinx
765

766
EM1SM:
767
|--Step 7	|X| < 1/4.
768
	cmpil		#0x3FBE0000,%d0	| ...2^(-65)
769
	bges		EM1POLY
770

771
EM1TINY:
772
|--Step 8	|X| < 2^(-65)
773
	cmpil		#0x00330000,%d0	| ...2^(-16312)
774
	blts		EM12TINY
775
|--Step 8.2
776
	movel		#0x80010000,SC(%a6)	| ...SC is -2^(-16382)
777
	movel		#0x80000000,SC+4(%a6)
778
	clrl		SC+8(%a6)
779
	fmovex		(%a0),%fp0
780
	fmovel		%d1,%FPCR
781
	faddx		SC(%a6),%fp0
782

783
	bra		t_frcinx
784

785
EM12TINY:
786
|--Step 8.3
787
	fmovex		(%a0),%fp0
788
	fmuld		TWO140,%fp0
789
	movel		#0x80010000,SC(%a6)
790
	movel		#0x80000000,SC+4(%a6)
791
	clrl		SC+8(%a6)
792
	faddx		SC(%a6),%fp0
793
	fmovel		%d1,%FPCR
794
	fmuld		TWON140,%fp0
795

796
	bra		t_frcinx
797

798
EM1POLY:
799
|--Step 9	exp(X)-1 by a simple polynomial
800
	fmovex		(%a0),%fp0	| ...fp0 is X
801
	fmulx		%fp0,%fp0		| ...fp0 is S := X*X
802
	fmovemx	%fp2-%fp2/%fp3,-(%a7)	| ...save fp2
803
	fmoves		#0x2F30CAA8,%fp1	| ...fp1 is B12
804
	fmulx		%fp0,%fp1		| ...fp1 is S*B12
805
	fmoves		#0x310F8290,%fp2	| ...fp2 is B11
806
	fadds		#0x32D73220,%fp1	| ...fp1 is B10+S*B12
807

808
	fmulx		%fp0,%fp2		| ...fp2 is S*B11
809
	fmulx		%fp0,%fp1		| ...fp1 is S*(B10 + ...
810

811
	fadds		#0x3493F281,%fp2	| ...fp2 is B9+S*...
812
	faddd		EM1B8,%fp1	| ...fp1 is B8+S*...
813

814
	fmulx		%fp0,%fp2		| ...fp2 is S*(B9+...
815
	fmulx		%fp0,%fp1		| ...fp1 is S*(B8+...
816

817
	faddd		EM1B7,%fp2	| ...fp2 is B7+S*...
818
	faddd		EM1B6,%fp1	| ...fp1 is B6+S*...
819

820
	fmulx		%fp0,%fp2		| ...fp2 is S*(B7+...
821
	fmulx		%fp0,%fp1		| ...fp1 is S*(B6+...
822

823
	faddd		EM1B5,%fp2	| ...fp2 is B5+S*...
824
	faddd		EM1B4,%fp1	| ...fp1 is B4+S*...
825

826
	fmulx		%fp0,%fp2		| ...fp2 is S*(B5+...
827
	fmulx		%fp0,%fp1		| ...fp1 is S*(B4+...
828

829
	faddd		EM1B3,%fp2	| ...fp2 is B3+S*...
830
	faddx		EM1B2,%fp1	| ...fp1 is B2+S*...
831

832
	fmulx		%fp0,%fp2		| ...fp2 is S*(B3+...
833
	fmulx		%fp0,%fp1		| ...fp1 is S*(B2+...
834

835
	fmulx		%fp0,%fp2		| ...fp2 is S*S*(B3+...)
836
	fmulx		(%a0),%fp1	| ...fp1 is X*S*(B2...
837

838
	fmuls		#0x3F000000,%fp0	| ...fp0 is S*B1
839
	faddx		%fp2,%fp1		| ...fp1 is Q
840
|					...fp2 released
841

842
	fmovemx	(%a7)+,%fp2-%fp2/%fp3	| ...fp2 restored
843

844
	faddx		%fp1,%fp0		| ...fp0 is S*B1+Q
845
|					...fp1 released
846

847
	fmovel		%d1,%FPCR
848
	faddx		(%a0),%fp0
849

850
	bra		t_frcinx
851

852
EM1BIG:
853
|--Step 10	|X| > 70 log2
854
	movel		(%a0),%d0
855
	cmpil		#0,%d0
856
	bgt		EXPC1
857
|--Step 10.2
858
	fmoves		#0xBF800000,%fp0	| ...fp0 is -1
859
	fmovel		%d1,%FPCR
860
	fadds		#0x00800000,%fp0	| ...-1 + 2^(-126)
861

862
	bra		t_frcinx
863

864
	|end
865

866
Product

Resources

Company