Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/m68k/fpsp040/stan.S
10817 views
1
|
2
| stan.sa 3.3 7/29/91
3
|
4
| The entry point stan computes the tangent of
5
| an input argument;
6
| stand does the same except for denormalized input.
7
|
8
| Input: Double-extended number X in location pointed to
9
| by address register a0.
10
|
11
| Output: The value tan(X) returned in floating-point register Fp0.
12
|
13
| Accuracy and Monotonicity: The returned result is within 3 ulp in
14
| 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
15
| result is subsequently rounded to double precision. The
16
| result is provably monotonic in double precision.
17
|
18
| Speed: The program sTAN takes approximately 170 cycles for
19
| input argument X such that |X| < 15Pi, which is the usual
20
| situation.
21
|
22
| Algorithm:
23
|
24
| 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
25
|
26
| 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
27
| k = N mod 2, so in particular, k = 0 or 1.
28
|
29
| 3. If k is odd, go to 5.
30
|
31
| 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
32
| rational function U/V where
33
| U = r + r*s*(P1 + s*(P2 + s*P3)), and
34
| V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r.
35
| Exit.
36
|
37
| 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
38
| rational function U/V where
39
| U = r + r*s*(P1 + s*(P2 + s*P3)), and
40
| V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
41
| -Cot(r) = -V/U. Exit.
42
|
43
| 6. If |X| > 1, go to 8.
44
|
45
| 7. (|X|<2**(-40)) Tan(X) = X. Exit.
46
|
47
| 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
48
|
49
50
| Copyright (C) Motorola, Inc. 1990
51
| All Rights Reserved
52
|
53
| For details on the license for this file, please see the
54
| file, README, in this same directory.
55
56
|STAN idnt 2,1 | Motorola 040 Floating Point Software Package
57
58
|section 8
59
60
#include "fpsp.h"
61
62
BOUNDS1: .long 0x3FD78000,0x4004BC7E
63
TWOBYPI: .long 0x3FE45F30,0x6DC9C883
64
65
TANQ4: .long 0x3EA0B759,0xF50F8688
66
TANP3: .long 0xBEF2BAA5,0xA8924F04
67
68
TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
69
70
TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
71
72
TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
73
74
TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
75
76
TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
77
78
INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
79
80
TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
81
TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
82
83
|--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
84
|--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
85
|--MOST 69 BITS LONG.
86
.global PITBL
87
PITBL:
88
.long 0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
89
.long 0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
90
.long 0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
91
.long 0xC0040000,0xB6365E22,0xEE46F000,0x21480000
92
.long 0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
93
.long 0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
94
.long 0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
95
.long 0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
96
.long 0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
97
.long 0xC0040000,0x90836524,0x88034B96,0x20B00000
98
.long 0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
99
.long 0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
100
.long 0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
101
.long 0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
102
.long 0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
103
.long 0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
104
.long 0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
105
.long 0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
106
.long 0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
107
.long 0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
108
.long 0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
109
.long 0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
110
.long 0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
111
.long 0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
112
.long 0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
113
.long 0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
114
.long 0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
115
.long 0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
116
.long 0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
117
.long 0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
118
.long 0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
119
.long 0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
120
.long 0x00000000,0x00000000,0x00000000,0x00000000
121
.long 0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
122
.long 0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
123
.long 0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
124
.long 0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
125
.long 0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
126
.long 0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
127
.long 0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
128
.long 0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
129
.long 0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
130
.long 0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
131
.long 0x40030000,0x8A3AE64F,0x76F80584,0x21080000
132
.long 0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
133
.long 0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
134
.long 0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
135
.long 0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
136
.long 0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
137
.long 0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
138
.long 0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
139
.long 0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
140
.long 0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
141
.long 0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
142
.long 0x40040000,0x8A3AE64F,0x76F80584,0x21880000
143
.long 0x40040000,0x90836524,0x88034B96,0xA0B00000
144
.long 0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
145
.long 0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
146
.long 0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
147
.long 0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
148
.long 0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
149
.long 0x40040000,0xB6365E22,0xEE46F000,0xA1480000
150
.long 0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
151
.long 0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
152
.long 0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
153
154
.set INARG,FP_SCR4
155
156
.set TWOTO63,L_SCR1
157
.set ENDFLAG,L_SCR2
158
.set N,L_SCR3
159
160
| xref t_frcinx
161
|xref t_extdnrm
162
163
.global stand
164
stand:
165
|--TAN(X) = X FOR DENORMALIZED X
166
167
bra t_extdnrm
168
169
.global stan
170
stan:
171
fmovex (%a0),%fp0 | ...LOAD INPUT
172
173
movel (%a0),%d0
174
movew 4(%a0),%d0
175
andil #0x7FFFFFFF,%d0
176
177
cmpil #0x3FD78000,%d0 | ...|X| >= 2**(-40)?
178
bges TANOK1
179
bra TANSM
180
TANOK1:
181
cmpil #0x4004BC7E,%d0 | ...|X| < 15 PI?
182
blts TANMAIN
183
bra REDUCEX
184
185
186
TANMAIN:
187
|--THIS IS THE USUAL CASE, |X| <= 15 PI.
188
|--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
189
fmovex %fp0,%fp1
190
fmuld TWOBYPI,%fp1 | ...X*2/PI
191
192
|--HIDE THE NEXT TWO INSTRUCTIONS
193
leal PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32
194
195
|--FP1 IS NOW READY
196
fmovel %fp1,%d0 | ...CONVERT TO INTEGER
197
198
asll #4,%d0
199
addal %d0,%a1 | ...ADDRESS N*PIBY2 IN Y1, Y2
200
201
fsubx (%a1)+,%fp0 | ...X-Y1
202
|--HIDE THE NEXT ONE
203
204
fsubs (%a1),%fp0 | ...FP0 IS R = (X-Y1)-Y2
205
206
rorl #5,%d0
207
andil #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0
208
209
TANCONT:
210
211
cmpil #0,%d0
212
blt NODD
213
214
fmovex %fp0,%fp1
215
fmulx %fp1,%fp1 | ...S = R*R
216
217
fmoved TANQ4,%fp3
218
fmoved TANP3,%fp2
219
220
fmulx %fp1,%fp3 | ...SQ4
221
fmulx %fp1,%fp2 | ...SP3
222
223
faddd TANQ3,%fp3 | ...Q3+SQ4
224
faddx TANP2,%fp2 | ...P2+SP3
225
226
fmulx %fp1,%fp3 | ...S(Q3+SQ4)
227
fmulx %fp1,%fp2 | ...S(P2+SP3)
228
229
faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
230
faddx TANP1,%fp2 | ...P1+S(P2+SP3)
231
232
fmulx %fp1,%fp3 | ...S(Q2+S(Q3+SQ4))
233
fmulx %fp1,%fp2 | ...S(P1+S(P2+SP3))
234
235
faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
236
fmulx %fp0,%fp2 | ...RS(P1+S(P2+SP3))
237
238
fmulx %fp3,%fp1 | ...S(Q1+S(Q2+S(Q3+SQ4)))
239
240
241
faddx %fp2,%fp0 | ...R+RS(P1+S(P2+SP3))
242
243
244
fadds #0x3F800000,%fp1 | ...1+S(Q1+...)
245
246
fmovel %d1,%fpcr |restore users exceptions
247
fdivx %fp1,%fp0 |last inst - possible exception set
248
249
bra t_frcinx
250
251
NODD:
252
fmovex %fp0,%fp1
253
fmulx %fp0,%fp0 | ...S = R*R
254
255
fmoved TANQ4,%fp3
256
fmoved TANP3,%fp2
257
258
fmulx %fp0,%fp3 | ...SQ4
259
fmulx %fp0,%fp2 | ...SP3
260
261
faddd TANQ3,%fp3 | ...Q3+SQ4
262
faddx TANP2,%fp2 | ...P2+SP3
263
264
fmulx %fp0,%fp3 | ...S(Q3+SQ4)
265
fmulx %fp0,%fp2 | ...S(P2+SP3)
266
267
faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
268
faddx TANP1,%fp2 | ...P1+S(P2+SP3)
269
270
fmulx %fp0,%fp3 | ...S(Q2+S(Q3+SQ4))
271
fmulx %fp0,%fp2 | ...S(P1+S(P2+SP3))
272
273
faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
274
fmulx %fp1,%fp2 | ...RS(P1+S(P2+SP3))
275
276
fmulx %fp3,%fp0 | ...S(Q1+S(Q2+S(Q3+SQ4)))
277
278
279
faddx %fp2,%fp1 | ...R+RS(P1+S(P2+SP3))
280
fadds #0x3F800000,%fp0 | ...1+S(Q1+...)
281
282
283
fmovex %fp1,-(%sp)
284
eoril #0x80000000,(%sp)
285
286
fmovel %d1,%fpcr |restore users exceptions
287
fdivx (%sp)+,%fp0 |last inst - possible exception set
288
289
bra t_frcinx
290
291
TANBORS:
292
|--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
293
|--IF |X| < 2**(-40), RETURN X OR 1.
294
cmpil #0x3FFF8000,%d0
295
bgts REDUCEX
296
297
TANSM:
298
299
fmovex %fp0,-(%sp)
300
fmovel %d1,%fpcr |restore users exceptions
301
fmovex (%sp)+,%fp0 |last inst - possible exception set
302
303
bra t_frcinx
304
305
306
REDUCEX:
307
|--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
308
|--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
309
|--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
310
311
fmovemx %fp2-%fp5,-(%a7) | ...save FP2 through FP5
312
movel %d2,-(%a7)
313
fmoves #0x00000000,%fp1
314
315
|--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
316
|--there is a danger of unwanted overflow in first LOOP iteration. In this
317
|--case, reduce argument by one remainder step to make subsequent reduction
318
|--safe.
319
cmpil #0x7ffeffff,%d0 |is argument dangerously large?
320
bnes LOOP
321
movel #0x7ffe0000,FP_SCR2(%a6) |yes
322
| ;create 2**16383*PI/2
323
movel #0xc90fdaa2,FP_SCR2+4(%a6)
324
clrl FP_SCR2+8(%a6)
325
ftstx %fp0 |test sign of argument
326
movel #0x7fdc0000,FP_SCR3(%a6) |create low half of 2**16383*
327
| ;PI/2 at FP_SCR3
328
movel #0x85a308d3,FP_SCR3+4(%a6)
329
clrl FP_SCR3+8(%a6)
330
fblt red_neg
331
orw #0x8000,FP_SCR2(%a6) |positive arg
332
orw #0x8000,FP_SCR3(%a6)
333
red_neg:
334
faddx FP_SCR2(%a6),%fp0 |high part of reduction is exact
335
fmovex %fp0,%fp1 |save high result in fp1
336
faddx FP_SCR3(%a6),%fp0 |low part of reduction
337
fsubx %fp0,%fp1 |determine low component of result
338
faddx FP_SCR3(%a6),%fp1 |fp0/fp1 are reduced argument.
339
340
|--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
341
|--integer quotient will be stored in N
342
|--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)
343
344
LOOP:
345
fmovex %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2
346
movew INARG(%a6),%d0
347
movel %d0,%a1 | ...save a copy of D0
348
andil #0x00007FFF,%d0
349
subil #0x00003FFF,%d0 | ...D0 IS K
350
cmpil #28,%d0
351
bles LASTLOOP
352
CONTLOOP:
353
subil #27,%d0 | ...D0 IS L := K-27
354
movel #0,ENDFLAG(%a6)
355
bras WORK
356
LASTLOOP:
357
clrl %d0 | ...D0 IS L := 0
358
movel #1,ENDFLAG(%a6)
359
360
WORK:
361
|--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
362
|--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
363
364
|--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
365
|--2**L * (PIby2_1), 2**L * (PIby2_2)
366
367
movel #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI
368
subl %d0,%d2 | ...BIASED EXPO OF 2**(-L)*(2/PI)
369
370
movel #0xA2F9836E,FP_SCR1+4(%a6)
371
movel #0x4E44152A,FP_SCR1+8(%a6)
372
movew %d2,FP_SCR1(%a6) | ...FP_SCR1 is 2**(-L)*(2/PI)
373
374
fmovex %fp0,%fp2
375
fmulx FP_SCR1(%a6),%fp2
376
|--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
377
|--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
378
|--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
379
|--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
380
|--US THE DESIRED VALUE IN FLOATING POINT.
381
382
|--HIDE SIX CYCLES OF INSTRUCTION
383
movel %a1,%d2
384
swap %d2
385
andil #0x80000000,%d2
386
oril #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL
387
movel %d2,TWOTO63(%a6)
388
389
movel %d0,%d2
390
addil #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2)
391
392
|--FP2 IS READY
393
fadds TWOTO63(%a6),%fp2 | ...THE FRACTIONAL PART OF FP1 IS ROUNDED
394
395
|--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2
396
movew %d2,FP_SCR2(%a6)
397
clrw FP_SCR2+2(%a6)
398
movel #0xC90FDAA2,FP_SCR2+4(%a6)
399
clrl FP_SCR2+8(%a6) | ...FP_SCR2 is 2**(L) * Piby2_1
400
401
|--FP2 IS READY
402
fsubs TWOTO63(%a6),%fp2 | ...FP2 is N
403
404
addil #0x00003FDD,%d0
405
movew %d0,FP_SCR3(%a6)
406
clrw FP_SCR3+2(%a6)
407
movel #0x85A308D3,FP_SCR3+4(%a6)
408
clrl FP_SCR3+8(%a6) | ...FP_SCR3 is 2**(L) * Piby2_2
409
410
movel ENDFLAG(%a6),%d0
411
412
|--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
413
|--P2 = 2**(L) * Piby2_2
414
fmovex %fp2,%fp4
415
fmulx FP_SCR2(%a6),%fp4 | ...W = N*P1
416
fmovex %fp2,%fp5
417
fmulx FP_SCR3(%a6),%fp5 | ...w = N*P2
418
fmovex %fp4,%fp3
419
|--we want P+p = W+w but |p| <= half ulp of P
420
|--Then, we need to compute A := R-P and a := r-p
421
faddx %fp5,%fp3 | ...FP3 is P
422
fsubx %fp3,%fp4 | ...W-P
423
424
fsubx %fp3,%fp0 | ...FP0 is A := R - P
425
faddx %fp5,%fp4 | ...FP4 is p = (W-P)+w
426
427
fmovex %fp0,%fp3 | ...FP3 A
428
fsubx %fp4,%fp1 | ...FP1 is a := r - p
429
430
|--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
431
|--|r| <= half ulp of R.
432
faddx %fp1,%fp0 | ...FP0 is R := A+a
433
|--No need to calculate r if this is the last loop
434
cmpil #0,%d0
435
bgt RESTORE
436
437
|--Need to calculate r
438
fsubx %fp0,%fp3 | ...A-R
439
faddx %fp3,%fp1 | ...FP1 is r := (A-R)+a
440
bra LOOP
441
442
RESTORE:
443
fmovel %fp2,N(%a6)
444
movel (%a7)+,%d2
445
fmovemx (%a7)+,%fp2-%fp5
446
447
448
movel N(%a6),%d0
449
rorl #1,%d0
450
451
452
bra TANCONT
453
454
|end
455
456