Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/softfloat/bits32/softfloat.c
39530 views
1
/* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
2
3
/*
4
* This version hacked for use with gcc -msoft-float by bjh21.
5
* (Mostly a case of #ifdefing out things GCC doesn't need or provides
6
* itself).
7
*/
8
9
/*
10
* Things you may want to define:
11
*
12
* SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13
* -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14
* properly renamed.
15
*/
16
17
/*
18
* This differs from the standard bits32/softfloat.c in that float64
19
* is defined to be a 64-bit integer rather than a structure. The
20
* structure is float64s, with translation between the two going via
21
* float64u.
22
*/
23
24
/*
25
===============================================================================
26
27
This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28
Arithmetic Package, Release 2a.
29
30
Written by John R. Hauser. This work was made possible in part by the
31
International Computer Science Institute, located at Suite 600, 1947 Center
32
Street, Berkeley, California 94704. Funding was partially provided by the
33
National Science Foundation under grant MIP-9311980. The original version
34
of this code was written as part of a project to build a fixed-point vector
35
processor in collaboration with the University of California at Berkeley,
36
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38
arithmetic/SoftFloat.html'.
39
40
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
45
46
Derivative works are acceptable, even for commercial purposes, so long as
47
(1) they include prominent notice that the work is derivative, and (2) they
48
include prominent notice akin to these four paragraphs for those parts of
49
this code that are retained.
50
51
===============================================================================
52
*/
53
54
#ifdef SOFTFLOAT_FOR_GCC
55
#include "softfloat-for-gcc.h"
56
#endif
57
58
#include "milieu.h"
59
#include "softfloat.h"
60
61
/*
62
* Conversions between floats as stored in memory and floats as
63
* SoftFloat uses them
64
*/
65
#ifndef FLOAT64_DEMANGLE
66
#define FLOAT64_DEMANGLE(a) (a)
67
#endif
68
#ifndef FLOAT64_MANGLE
69
#define FLOAT64_MANGLE(a) (a)
70
#endif
71
72
/*
73
-------------------------------------------------------------------------------
74
Floating-point rounding mode and exception flags.
75
-------------------------------------------------------------------------------
76
*/
77
int float_rounding_mode = float_round_nearest_even;
78
int float_exception_flags = 0;
79
80
/*
81
-------------------------------------------------------------------------------
82
Primitive arithmetic functions, including multi-word arithmetic, and
83
division and square root approximations. (Can be specialized to target if
84
desired.)
85
-------------------------------------------------------------------------------
86
*/
87
#include "softfloat-macros"
88
89
/*
90
-------------------------------------------------------------------------------
91
Functions and definitions to determine: (1) whether tininess for underflow
92
is detected before or after rounding by default, (2) what (if anything)
93
happens when exceptions are raised, (3) how signaling NaNs are distinguished
94
from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
95
are propagated from function inputs to output. These details are target-
96
specific.
97
-------------------------------------------------------------------------------
98
*/
99
#include "softfloat-specialize"
100
101
/*
102
-------------------------------------------------------------------------------
103
Returns the fraction bits of the single-precision floating-point value `a'.
104
-------------------------------------------------------------------------------
105
*/
106
INLINE bits32 extractFloat32Frac( float32 a )
107
{
108
109
return a & 0x007FFFFF;
110
111
}
112
113
/*
114
-------------------------------------------------------------------------------
115
Returns the exponent bits of the single-precision floating-point value `a'.
116
-------------------------------------------------------------------------------
117
*/
118
INLINE int16 extractFloat32Exp( float32 a )
119
{
120
121
return ( a>>23 ) & 0xFF;
122
123
}
124
125
/*
126
-------------------------------------------------------------------------------
127
Returns the sign bit of the single-precision floating-point value `a'.
128
-------------------------------------------------------------------------------
129
*/
130
INLINE flag extractFloat32Sign( float32 a )
131
{
132
133
return a>>31;
134
135
}
136
137
/*
138
-------------------------------------------------------------------------------
139
Normalizes the subnormal single-precision floating-point value represented
140
by the denormalized significand `aSig'. The normalized exponent and
141
significand are stored at the locations pointed to by `zExpPtr' and
142
`zSigPtr', respectively.
143
-------------------------------------------------------------------------------
144
*/
145
static void
146
normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
147
{
148
int8 shiftCount;
149
150
shiftCount = countLeadingZeros32( aSig ) - 8;
151
*zSigPtr = aSig<<shiftCount;
152
*zExpPtr = 1 - shiftCount;
153
154
}
155
156
/*
157
-------------------------------------------------------------------------------
158
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
159
single-precision floating-point value, returning the result. After being
160
shifted into the proper positions, the three fields are simply added
161
together to form the result. This means that any integer portion of `zSig'
162
will be added into the exponent. Since a properly normalized significand
163
will have an integer portion equal to 1, the `zExp' input should be 1 less
164
than the desired result exponent whenever `zSig' is a complete, normalized
165
significand.
166
-------------------------------------------------------------------------------
167
*/
168
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
169
{
170
171
return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
172
173
}
174
175
/*
176
-------------------------------------------------------------------------------
177
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
178
and significand `zSig', and returns the proper single-precision floating-
179
point value corresponding to the abstract input. Ordinarily, the abstract
180
value is simply rounded and packed into the single-precision format, with
181
the inexact exception raised if the abstract input cannot be represented
182
exactly. However, if the abstract value is too large, the overflow and
183
inexact exceptions are raised and an infinity or maximal finite value is
184
returned. If the abstract value is too small, the input value is rounded to
185
a subnormal number, and the underflow and inexact exceptions are raised if
186
the abstract input cannot be represented exactly as a subnormal single-
187
precision floating-point number.
188
The input significand `zSig' has its binary point between bits 30
189
and 29, which is 7 bits to the left of the usual location. This shifted
190
significand must be normalized or smaller. If `zSig' is not normalized,
191
`zExp' must be 0; in that case, the result returned is a subnormal number,
192
and it must not require rounding. In the usual case that `zSig' is
193
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
194
The handling of underflow and overflow follows the IEC/IEEE Standard for
195
Binary Floating-Point Arithmetic.
196
-------------------------------------------------------------------------------
197
*/
198
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
199
{
200
int8 roundingMode;
201
flag roundNearestEven;
202
int8 roundIncrement, roundBits;
203
flag isTiny;
204
205
roundingMode = float_rounding_mode;
206
roundNearestEven = roundingMode == float_round_nearest_even;
207
roundIncrement = 0x40;
208
if ( ! roundNearestEven ) {
209
if ( roundingMode == float_round_to_zero ) {
210
roundIncrement = 0;
211
}
212
else {
213
roundIncrement = 0x7F;
214
if ( zSign ) {
215
if ( roundingMode == float_round_up ) roundIncrement = 0;
216
}
217
else {
218
if ( roundingMode == float_round_down ) roundIncrement = 0;
219
}
220
}
221
}
222
roundBits = zSig & 0x7F;
223
if ( 0xFD <= (bits16) zExp ) {
224
if ( ( 0xFD < zExp )
225
|| ( ( zExp == 0xFD )
226
&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
227
) {
228
float_raise( float_flag_overflow | float_flag_inexact );
229
return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
230
}
231
if ( zExp < 0 ) {
232
isTiny =
233
( float_detect_tininess == float_tininess_before_rounding )
234
|| ( zExp < -1 )
235
|| ( zSig + roundIncrement < 0x80000000 );
236
shift32RightJamming( zSig, - zExp, &zSig );
237
zExp = 0;
238
roundBits = zSig & 0x7F;
239
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
240
}
241
}
242
if ( roundBits ) float_exception_flags |= float_flag_inexact;
243
zSig = ( zSig + roundIncrement )>>7;
244
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
245
if ( zSig == 0 ) zExp = 0;
246
return packFloat32( zSign, zExp, zSig );
247
248
}
249
250
/*
251
-------------------------------------------------------------------------------
252
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
253
and significand `zSig', and returns the proper single-precision floating-
254
point value corresponding to the abstract input. This routine is just like
255
`roundAndPackFloat32' except that `zSig' does not have to be normalized.
256
Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
257
floating-point exponent.
258
-------------------------------------------------------------------------------
259
*/
260
static float32
261
normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
262
{
263
int8 shiftCount;
264
265
shiftCount = countLeadingZeros32( zSig ) - 1;
266
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
267
268
}
269
270
/*
271
-------------------------------------------------------------------------------
272
Returns the least-significant 32 fraction bits of the double-precision
273
floating-point value `a'.
274
-------------------------------------------------------------------------------
275
*/
276
INLINE bits32 extractFloat64Frac1( float64 a )
277
{
278
279
return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
280
281
}
282
283
/*
284
-------------------------------------------------------------------------------
285
Returns the most-significant 20 fraction bits of the double-precision
286
floating-point value `a'.
287
-------------------------------------------------------------------------------
288
*/
289
INLINE bits32 extractFloat64Frac0( float64 a )
290
{
291
292
return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
293
294
}
295
296
/*
297
-------------------------------------------------------------------------------
298
Returns the exponent bits of the double-precision floating-point value `a'.
299
-------------------------------------------------------------------------------
300
*/
301
INLINE int16 extractFloat64Exp( float64 a )
302
{
303
304
return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
305
306
}
307
308
/*
309
-------------------------------------------------------------------------------
310
Returns the sign bit of the double-precision floating-point value `a'.
311
-------------------------------------------------------------------------------
312
*/
313
INLINE flag extractFloat64Sign( float64 a )
314
{
315
316
return FLOAT64_DEMANGLE(a)>>63;
317
318
}
319
320
/*
321
-------------------------------------------------------------------------------
322
Normalizes the subnormal double-precision floating-point value represented
323
by the denormalized significand formed by the concatenation of `aSig0' and
324
`aSig1'. The normalized exponent is stored at the location pointed to by
325
`zExpPtr'. The most significant 21 bits of the normalized significand are
326
stored at the location pointed to by `zSig0Ptr', and the least significant
327
32 bits of the normalized significand are stored at the location pointed to
328
by `zSig1Ptr'.
329
-------------------------------------------------------------------------------
330
*/
331
static void
332
normalizeFloat64Subnormal(
333
bits32 aSig0,
334
bits32 aSig1,
335
int16 *zExpPtr,
336
bits32 *zSig0Ptr,
337
bits32 *zSig1Ptr
338
)
339
{
340
int8 shiftCount;
341
342
if ( aSig0 == 0 ) {
343
shiftCount = countLeadingZeros32( aSig1 ) - 11;
344
if ( shiftCount < 0 ) {
345
*zSig0Ptr = aSig1>>( - shiftCount );
346
*zSig1Ptr = aSig1<<( shiftCount & 31 );
347
}
348
else {
349
*zSig0Ptr = aSig1<<shiftCount;
350
*zSig1Ptr = 0;
351
}
352
*zExpPtr = - shiftCount - 31;
353
}
354
else {
355
shiftCount = countLeadingZeros32( aSig0 ) - 11;
356
shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
357
*zExpPtr = 1 - shiftCount;
358
}
359
360
}
361
362
/*
363
-------------------------------------------------------------------------------
364
Packs the sign `zSign', the exponent `zExp', and the significand formed by
365
the concatenation of `zSig0' and `zSig1' into a double-precision floating-
366
point value, returning the result. After being shifted into the proper
367
positions, the three fields `zSign', `zExp', and `zSig0' are simply added
368
together to form the most significant 32 bits of the result. This means
369
that any integer portion of `zSig0' will be added into the exponent. Since
370
a properly normalized significand will have an integer portion equal to 1,
371
the `zExp' input should be 1 less than the desired result exponent whenever
372
`zSig0' and `zSig1' concatenated form a complete, normalized significand.
373
-------------------------------------------------------------------------------
374
*/
375
INLINE float64
376
packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
377
{
378
379
return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
380
( ( (bits64) zExp )<<52 ) +
381
( ( (bits64) zSig0 )<<32 ) + zSig1 );
382
383
384
}
385
386
/*
387
-------------------------------------------------------------------------------
388
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389
and extended significand formed by the concatenation of `zSig0', `zSig1',
390
and `zSig2', and returns the proper double-precision floating-point value
391
corresponding to the abstract input. Ordinarily, the abstract value is
392
simply rounded and packed into the double-precision format, with the inexact
393
exception raised if the abstract input cannot be represented exactly.
394
However, if the abstract value is too large, the overflow and inexact
395
exceptions are raised and an infinity or maximal finite value is returned.
396
If the abstract value is too small, the input value is rounded to a
397
subnormal number, and the underflow and inexact exceptions are raised if the
398
abstract input cannot be represented exactly as a subnormal double-precision
399
floating-point number.
400
The input significand must be normalized or smaller. If the input
401
significand is not normalized, `zExp' must be 0; in that case, the result
402
returned is a subnormal number, and it must not require rounding. In the
403
usual case that the input significand is normalized, `zExp' must be 1 less
404
than the ``true'' floating-point exponent. The handling of underflow and
405
overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
406
-------------------------------------------------------------------------------
407
*/
408
static float64
409
roundAndPackFloat64(
410
flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
411
{
412
int8 roundingMode;
413
flag roundNearestEven, increment, isTiny;
414
415
roundingMode = float_rounding_mode;
416
roundNearestEven = ( roundingMode == float_round_nearest_even );
417
increment = ( (sbits32) zSig2 < 0 );
418
if ( ! roundNearestEven ) {
419
if ( roundingMode == float_round_to_zero ) {
420
increment = 0;
421
}
422
else {
423
if ( zSign ) {
424
increment = ( roundingMode == float_round_down ) && zSig2;
425
}
426
else {
427
increment = ( roundingMode == float_round_up ) && zSig2;
428
}
429
}
430
}
431
if ( 0x7FD <= (bits16) zExp ) {
432
if ( ( 0x7FD < zExp )
433
|| ( ( zExp == 0x7FD )
434
&& eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
435
&& increment
436
)
437
) {
438
float_raise( float_flag_overflow | float_flag_inexact );
439
if ( ( roundingMode == float_round_to_zero )
440
|| ( zSign && ( roundingMode == float_round_up ) )
441
|| ( ! zSign && ( roundingMode == float_round_down ) )
442
) {
443
return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
444
}
445
return packFloat64( zSign, 0x7FF, 0, 0 );
446
}
447
if ( zExp < 0 ) {
448
isTiny =
449
( float_detect_tininess == float_tininess_before_rounding )
450
|| ( zExp < -1 )
451
|| ! increment
452
|| lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
453
shift64ExtraRightJamming(
454
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
455
zExp = 0;
456
if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
457
if ( roundNearestEven ) {
458
increment = ( (sbits32) zSig2 < 0 );
459
}
460
else {
461
if ( zSign ) {
462
increment = ( roundingMode == float_round_down ) && zSig2;
463
}
464
else {
465
increment = ( roundingMode == float_round_up ) && zSig2;
466
}
467
}
468
}
469
}
470
if ( zSig2 ) float_exception_flags |= float_flag_inexact;
471
if ( increment ) {
472
add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
473
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
474
}
475
else {
476
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
477
}
478
return packFloat64( zSign, zExp, zSig0, zSig1 );
479
480
}
481
482
/*
483
-------------------------------------------------------------------------------
484
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
485
and significand formed by the concatenation of `zSig0' and `zSig1', and
486
returns the proper double-precision floating-point value corresponding
487
to the abstract input. This routine is just like `roundAndPackFloat64'
488
except that the input significand has fewer bits and does not have to be
489
normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
490
point exponent.
491
-------------------------------------------------------------------------------
492
*/
493
static float64
494
normalizeRoundAndPackFloat64(
495
flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
496
{
497
int8 shiftCount;
498
bits32 zSig2;
499
500
if ( zSig0 == 0 ) {
501
zSig0 = zSig1;
502
zSig1 = 0;
503
zExp -= 32;
504
}
505
shiftCount = countLeadingZeros32( zSig0 ) - 11;
506
if ( 0 <= shiftCount ) {
507
zSig2 = 0;
508
shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
509
}
510
else {
511
shift64ExtraRightJamming(
512
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
513
}
514
zExp -= shiftCount;
515
return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
516
517
}
518
519
/*
520
-------------------------------------------------------------------------------
521
Returns the result of converting the 32-bit two's complement integer `a' to
522
the single-precision floating-point format. The conversion is performed
523
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
524
-------------------------------------------------------------------------------
525
*/
526
float32 int32_to_float32( int32 a )
527
{
528
flag zSign;
529
530
if ( a == 0 ) return 0;
531
if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
532
zSign = ( a < 0 );
533
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
534
535
}
536
537
/*
538
-------------------------------------------------------------------------------
539
Returns the result of converting the 32-bit two's complement integer `a' to
540
the double-precision floating-point format. The conversion is performed
541
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
542
-------------------------------------------------------------------------------
543
*/
544
float64 int32_to_float64( int32 a )
545
{
546
flag zSign;
547
bits32 absA;
548
int8 shiftCount;
549
bits32 zSig0, zSig1;
550
551
if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
552
zSign = ( a < 0 );
553
absA = zSign ? - a : a;
554
shiftCount = countLeadingZeros32( absA ) - 11;
555
if ( 0 <= shiftCount ) {
556
zSig0 = absA<<shiftCount;
557
zSig1 = 0;
558
}
559
else {
560
shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
561
}
562
return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
563
564
}
565
566
#ifndef SOFTFLOAT_FOR_GCC
567
/*
568
-------------------------------------------------------------------------------
569
Returns the result of converting the single-precision floating-point value
570
`a' to the 32-bit two's complement integer format. The conversion is
571
performed according to the IEC/IEEE Standard for Binary Floating-Point
572
Arithmetic---which means in particular that the conversion is rounded
573
according to the current rounding mode. If `a' is a NaN, the largest
574
positive integer is returned. Otherwise, if the conversion overflows, the
575
largest integer with the same sign as `a' is returned.
576
-------------------------------------------------------------------------------
577
*/
578
int32 float32_to_int32( float32 a )
579
{
580
flag aSign;
581
int16 aExp, shiftCount;
582
bits32 aSig, aSigExtra;
583
int32 z;
584
int8 roundingMode;
585
586
aSig = extractFloat32Frac( a );
587
aExp = extractFloat32Exp( a );
588
aSign = extractFloat32Sign( a );
589
shiftCount = aExp - 0x96;
590
if ( 0 <= shiftCount ) {
591
if ( 0x9E <= aExp ) {
592
if ( a != 0xCF000000 ) {
593
float_raise( float_flag_invalid );
594
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
595
return 0x7FFFFFFF;
596
}
597
}
598
return (sbits32) 0x80000000;
599
}
600
z = ( aSig | 0x00800000 )<<shiftCount;
601
if ( aSign ) z = - z;
602
}
603
else {
604
if ( aExp < 0x7E ) {
605
aSigExtra = aExp | aSig;
606
z = 0;
607
}
608
else {
609
aSig |= 0x00800000;
610
aSigExtra = aSig<<( shiftCount & 31 );
611
z = aSig>>( - shiftCount );
612
}
613
if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
614
roundingMode = float_rounding_mode;
615
if ( roundingMode == float_round_nearest_even ) {
616
if ( (sbits32) aSigExtra < 0 ) {
617
++z;
618
if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
619
}
620
if ( aSign ) z = - z;
621
}
622
else {
623
aSigExtra = ( aSigExtra != 0 );
624
if ( aSign ) {
625
z += ( roundingMode == float_round_down ) & aSigExtra;
626
z = - z;
627
}
628
else {
629
z += ( roundingMode == float_round_up ) & aSigExtra;
630
}
631
}
632
}
633
return z;
634
635
}
636
#endif
637
638
/*
639
-------------------------------------------------------------------------------
640
Returns the result of converting the single-precision floating-point value
641
`a' to the 32-bit two's complement integer format. The conversion is
642
performed according to the IEC/IEEE Standard for Binary Floating-Point
643
Arithmetic, except that the conversion is always rounded toward zero.
644
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
645
the conversion overflows, the largest integer with the same sign as `a' is
646
returned.
647
-------------------------------------------------------------------------------
648
*/
649
int32 float32_to_int32_round_to_zero( float32 a )
650
{
651
flag aSign;
652
int16 aExp, shiftCount;
653
bits32 aSig;
654
int32 z;
655
656
aSig = extractFloat32Frac( a );
657
aExp = extractFloat32Exp( a );
658
aSign = extractFloat32Sign( a );
659
shiftCount = aExp - 0x9E;
660
if ( 0 <= shiftCount ) {
661
if ( a != 0xCF000000 ) {
662
float_raise( float_flag_invalid );
663
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
664
}
665
return (sbits32) 0x80000000;
666
}
667
else if ( aExp <= 0x7E ) {
668
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
669
return 0;
670
}
671
aSig = ( aSig | 0x00800000 )<<8;
672
z = aSig>>( - shiftCount );
673
if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
674
float_exception_flags |= float_flag_inexact;
675
}
676
if ( aSign ) z = - z;
677
return z;
678
679
}
680
681
/*
682
-------------------------------------------------------------------------------
683
Returns the result of converting the single-precision floating-point value
684
`a' to the double-precision floating-point format. The conversion is
685
performed according to the IEC/IEEE Standard for Binary Floating-Point
686
Arithmetic.
687
-------------------------------------------------------------------------------
688
*/
689
float64 float32_to_float64( float32 a )
690
{
691
flag aSign;
692
int16 aExp;
693
bits32 aSig, zSig0, zSig1;
694
695
aSig = extractFloat32Frac( a );
696
aExp = extractFloat32Exp( a );
697
aSign = extractFloat32Sign( a );
698
if ( aExp == 0xFF ) {
699
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
700
return packFloat64( aSign, 0x7FF, 0, 0 );
701
}
702
if ( aExp == 0 ) {
703
if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
704
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
705
--aExp;
706
}
707
shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
708
return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
709
710
}
711
712
#ifndef SOFTFLOAT_FOR_GCC
713
/*
714
-------------------------------------------------------------------------------
715
Rounds the single-precision floating-point value `a' to an integer,
716
and returns the result as a single-precision floating-point value. The
717
operation is performed according to the IEC/IEEE Standard for Binary
718
Floating-Point Arithmetic.
719
-------------------------------------------------------------------------------
720
*/
721
float32 float32_round_to_int( float32 a )
722
{
723
flag aSign;
724
int16 aExp;
725
bits32 lastBitMask, roundBitsMask;
726
int8 roundingMode;
727
float32 z;
728
729
aExp = extractFloat32Exp( a );
730
if ( 0x96 <= aExp ) {
731
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
732
return propagateFloat32NaN( a, a );
733
}
734
return a;
735
}
736
if ( aExp <= 0x7E ) {
737
if ( (bits32) ( a<<1 ) == 0 ) return a;
738
float_exception_flags |= float_flag_inexact;
739
aSign = extractFloat32Sign( a );
740
switch ( float_rounding_mode ) {
741
case float_round_nearest_even:
742
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
743
return packFloat32( aSign, 0x7F, 0 );
744
}
745
break;
746
case float_round_to_zero:
747
break;
748
case float_round_down:
749
return aSign ? 0xBF800000 : 0;
750
case float_round_up:
751
return aSign ? 0x80000000 : 0x3F800000;
752
}
753
return packFloat32( aSign, 0, 0 );
754
}
755
lastBitMask = 1;
756
lastBitMask <<= 0x96 - aExp;
757
roundBitsMask = lastBitMask - 1;
758
z = a;
759
roundingMode = float_rounding_mode;
760
if ( roundingMode == float_round_nearest_even ) {
761
z += lastBitMask>>1;
762
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
763
}
764
else if ( roundingMode != float_round_to_zero ) {
765
if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
766
z += roundBitsMask;
767
}
768
}
769
z &= ~ roundBitsMask;
770
if ( z != a ) float_exception_flags |= float_flag_inexact;
771
return z;
772
773
}
774
#endif
775
776
/*
777
-------------------------------------------------------------------------------
778
Returns the result of adding the absolute values of the single-precision
779
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
780
before being returned. `zSign' is ignored if the result is a NaN.
781
The addition is performed according to the IEC/IEEE Standard for Binary
782
Floating-Point Arithmetic.
783
-------------------------------------------------------------------------------
784
*/
785
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
786
{
787
int16 aExp, bExp, zExp;
788
bits32 aSig, bSig, zSig;
789
int16 expDiff;
790
791
aSig = extractFloat32Frac( a );
792
aExp = extractFloat32Exp( a );
793
bSig = extractFloat32Frac( b );
794
bExp = extractFloat32Exp( b );
795
expDiff = aExp - bExp;
796
aSig <<= 6;
797
bSig <<= 6;
798
if ( 0 < expDiff ) {
799
if ( aExp == 0xFF ) {
800
if ( aSig ) return propagateFloat32NaN( a, b );
801
return a;
802
}
803
if ( bExp == 0 ) {
804
--expDiff;
805
}
806
else {
807
bSig |= 0x20000000;
808
}
809
shift32RightJamming( bSig, expDiff, &bSig );
810
zExp = aExp;
811
}
812
else if ( expDiff < 0 ) {
813
if ( bExp == 0xFF ) {
814
if ( bSig ) return propagateFloat32NaN( a, b );
815
return packFloat32( zSign, 0xFF, 0 );
816
}
817
if ( aExp == 0 ) {
818
++expDiff;
819
}
820
else {
821
aSig |= 0x20000000;
822
}
823
shift32RightJamming( aSig, - expDiff, &aSig );
824
zExp = bExp;
825
}
826
else {
827
if ( aExp == 0xFF ) {
828
if ( aSig | bSig ) return propagateFloat32NaN( a, b );
829
return a;
830
}
831
if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
832
zSig = 0x40000000 + aSig + bSig;
833
zExp = aExp;
834
goto roundAndPack;
835
}
836
aSig |= 0x20000000;
837
zSig = ( aSig + bSig )<<1;
838
--zExp;
839
if ( (sbits32) zSig < 0 ) {
840
zSig = aSig + bSig;
841
++zExp;
842
}
843
roundAndPack:
844
return roundAndPackFloat32( zSign, zExp, zSig );
845
846
}
847
848
/*
849
-------------------------------------------------------------------------------
850
Returns the result of subtracting the absolute values of the single-
851
precision floating-point values `a' and `b'. If `zSign' is 1, the
852
difference is negated before being returned. `zSign' is ignored if the
853
result is a NaN. The subtraction is performed according to the IEC/IEEE
854
Standard for Binary Floating-Point Arithmetic.
855
-------------------------------------------------------------------------------
856
*/
857
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
858
{
859
int16 aExp, bExp, zExp;
860
bits32 aSig, bSig, zSig;
861
int16 expDiff;
862
863
aSig = extractFloat32Frac( a );
864
aExp = extractFloat32Exp( a );
865
bSig = extractFloat32Frac( b );
866
bExp = extractFloat32Exp( b );
867
expDiff = aExp - bExp;
868
aSig <<= 7;
869
bSig <<= 7;
870
if ( 0 < expDiff ) goto aExpBigger;
871
if ( expDiff < 0 ) goto bExpBigger;
872
if ( aExp == 0xFF ) {
873
if ( aSig | bSig ) return propagateFloat32NaN( a, b );
874
float_raise( float_flag_invalid );
875
return float32_default_nan;
876
}
877
if ( aExp == 0 ) {
878
aExp = 1;
879
bExp = 1;
880
}
881
if ( bSig < aSig ) goto aBigger;
882
if ( aSig < bSig ) goto bBigger;
883
return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
884
bExpBigger:
885
if ( bExp == 0xFF ) {
886
if ( bSig ) return propagateFloat32NaN( a, b );
887
return packFloat32( zSign ^ 1, 0xFF, 0 );
888
}
889
if ( aExp == 0 ) {
890
++expDiff;
891
}
892
else {
893
aSig |= 0x40000000;
894
}
895
shift32RightJamming( aSig, - expDiff, &aSig );
896
bSig |= 0x40000000;
897
bBigger:
898
zSig = bSig - aSig;
899
zExp = bExp;
900
zSign ^= 1;
901
goto normalizeRoundAndPack;
902
aExpBigger:
903
if ( aExp == 0xFF ) {
904
if ( aSig ) return propagateFloat32NaN( a, b );
905
return a;
906
}
907
if ( bExp == 0 ) {
908
--expDiff;
909
}
910
else {
911
bSig |= 0x40000000;
912
}
913
shift32RightJamming( bSig, expDiff, &bSig );
914
aSig |= 0x40000000;
915
aBigger:
916
zSig = aSig - bSig;
917
zExp = aExp;
918
normalizeRoundAndPack:
919
--zExp;
920
return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
921
922
}
923
924
/*
925
-------------------------------------------------------------------------------
926
Returns the result of adding the single-precision floating-point values `a'
927
and `b'. The operation is performed according to the IEC/IEEE Standard for
928
Binary Floating-Point Arithmetic.
929
-------------------------------------------------------------------------------
930
*/
931
float32 float32_add( float32 a, float32 b )
932
{
933
flag aSign, bSign;
934
935
aSign = extractFloat32Sign( a );
936
bSign = extractFloat32Sign( b );
937
if ( aSign == bSign ) {
938
return addFloat32Sigs( a, b, aSign );
939
}
940
else {
941
return subFloat32Sigs( a, b, aSign );
942
}
943
944
}
945
946
/*
947
-------------------------------------------------------------------------------
948
Returns the result of subtracting the single-precision floating-point values
949
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
950
for Binary Floating-Point Arithmetic.
951
-------------------------------------------------------------------------------
952
*/
953
float32 float32_sub( float32 a, float32 b )
954
{
955
flag aSign, bSign;
956
957
aSign = extractFloat32Sign( a );
958
bSign = extractFloat32Sign( b );
959
if ( aSign == bSign ) {
960
return subFloat32Sigs( a, b, aSign );
961
}
962
else {
963
return addFloat32Sigs( a, b, aSign );
964
}
965
966
}
967
968
/*
969
-------------------------------------------------------------------------------
970
Returns the result of multiplying the single-precision floating-point values
971
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
972
for Binary Floating-Point Arithmetic.
973
-------------------------------------------------------------------------------
974
*/
975
float32 float32_mul( float32 a, float32 b )
976
{
977
flag aSign, bSign, zSign;
978
int16 aExp, bExp, zExp;
979
bits32 aSig, bSig, zSig0, zSig1;
980
981
aSig = extractFloat32Frac( a );
982
aExp = extractFloat32Exp( a );
983
aSign = extractFloat32Sign( a );
984
bSig = extractFloat32Frac( b );
985
bExp = extractFloat32Exp( b );
986
bSign = extractFloat32Sign( b );
987
zSign = aSign ^ bSign;
988
if ( aExp == 0xFF ) {
989
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
990
return propagateFloat32NaN( a, b );
991
}
992
if ( ( bExp | bSig ) == 0 ) {
993
float_raise( float_flag_invalid );
994
return float32_default_nan;
995
}
996
return packFloat32( zSign, 0xFF, 0 );
997
}
998
if ( bExp == 0xFF ) {
999
if ( bSig ) return propagateFloat32NaN( a, b );
1000
if ( ( aExp | aSig ) == 0 ) {
1001
float_raise( float_flag_invalid );
1002
return float32_default_nan;
1003
}
1004
return packFloat32( zSign, 0xFF, 0 );
1005
}
1006
if ( aExp == 0 ) {
1007
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1008
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1009
}
1010
if ( bExp == 0 ) {
1011
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1012
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1013
}
1014
zExp = aExp + bExp - 0x7F;
1015
aSig = ( aSig | 0x00800000 )<<7;
1016
bSig = ( bSig | 0x00800000 )<<8;
1017
mul32To64( aSig, bSig, &zSig0, &zSig1 );
1018
zSig0 |= ( zSig1 != 0 );
1019
if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1020
zSig0 <<= 1;
1021
--zExp;
1022
}
1023
return roundAndPackFloat32( zSign, zExp, zSig0 );
1024
1025
}
1026
1027
/*
1028
-------------------------------------------------------------------------------
1029
Returns the result of dividing the single-precision floating-point value `a'
1030
by the corresponding value `b'. The operation is performed according to the
1031
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1032
-------------------------------------------------------------------------------
1033
*/
1034
float32 float32_div( float32 a, float32 b )
1035
{
1036
flag aSign, bSign, zSign;
1037
int16 aExp, bExp, zExp;
1038
bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1039
1040
aSig = extractFloat32Frac( a );
1041
aExp = extractFloat32Exp( a );
1042
aSign = extractFloat32Sign( a );
1043
bSig = extractFloat32Frac( b );
1044
bExp = extractFloat32Exp( b );
1045
bSign = extractFloat32Sign( b );
1046
zSign = aSign ^ bSign;
1047
if ( aExp == 0xFF ) {
1048
if ( aSig ) return propagateFloat32NaN( a, b );
1049
if ( bExp == 0xFF ) {
1050
if ( bSig ) return propagateFloat32NaN( a, b );
1051
float_raise( float_flag_invalid );
1052
return float32_default_nan;
1053
}
1054
return packFloat32( zSign, 0xFF, 0 );
1055
}
1056
if ( bExp == 0xFF ) {
1057
if ( bSig ) return propagateFloat32NaN( a, b );
1058
return packFloat32( zSign, 0, 0 );
1059
}
1060
if ( bExp == 0 ) {
1061
if ( bSig == 0 ) {
1062
if ( ( aExp | aSig ) == 0 ) {
1063
float_raise( float_flag_invalid );
1064
return float32_default_nan;
1065
}
1066
float_raise( float_flag_divbyzero );
1067
return packFloat32( zSign, 0xFF, 0 );
1068
}
1069
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1070
}
1071
if ( aExp == 0 ) {
1072
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1073
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1074
}
1075
zExp = aExp - bExp + 0x7D;
1076
aSig = ( aSig | 0x00800000 )<<7;
1077
bSig = ( bSig | 0x00800000 )<<8;
1078
if ( bSig <= ( aSig + aSig ) ) {
1079
aSig >>= 1;
1080
++zExp;
1081
}
1082
zSig = estimateDiv64To32( aSig, 0, bSig );
1083
if ( ( zSig & 0x3F ) <= 2 ) {
1084
mul32To64( bSig, zSig, &term0, &term1 );
1085
sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1086
while ( (sbits32) rem0 < 0 ) {
1087
--zSig;
1088
add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1089
}
1090
zSig |= ( rem1 != 0 );
1091
}
1092
return roundAndPackFloat32( zSign, zExp, zSig );
1093
1094
}
1095
1096
#ifndef SOFTFLOAT_FOR_GCC
1097
/*
1098
-------------------------------------------------------------------------------
1099
Returns the remainder of the single-precision floating-point value `a'
1100
with respect to the corresponding value `b'. The operation is performed
1101
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1102
-------------------------------------------------------------------------------
1103
*/
1104
float32 float32_rem( float32 a, float32 b )
1105
{
1106
flag aSign, bSign, zSign;
1107
int16 aExp, bExp, expDiff;
1108
bits32 aSig, bSig, q, allZero, alternateASig;
1109
sbits32 sigMean;
1110
1111
aSig = extractFloat32Frac( a );
1112
aExp = extractFloat32Exp( a );
1113
aSign = extractFloat32Sign( a );
1114
bSig = extractFloat32Frac( b );
1115
bExp = extractFloat32Exp( b );
1116
bSign = extractFloat32Sign( b );
1117
if ( aExp == 0xFF ) {
1118
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1119
return propagateFloat32NaN( a, b );
1120
}
1121
float_raise( float_flag_invalid );
1122
return float32_default_nan;
1123
}
1124
if ( bExp == 0xFF ) {
1125
if ( bSig ) return propagateFloat32NaN( a, b );
1126
return a;
1127
}
1128
if ( bExp == 0 ) {
1129
if ( bSig == 0 ) {
1130
float_raise( float_flag_invalid );
1131
return float32_default_nan;
1132
}
1133
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1134
}
1135
if ( aExp == 0 ) {
1136
if ( aSig == 0 ) return a;
1137
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1138
}
1139
expDiff = aExp - bExp;
1140
aSig = ( aSig | 0x00800000 )<<8;
1141
bSig = ( bSig | 0x00800000 )<<8;
1142
if ( expDiff < 0 ) {
1143
if ( expDiff < -1 ) return a;
1144
aSig >>= 1;
1145
}
1146
q = ( bSig <= aSig );
1147
if ( q ) aSig -= bSig;
1148
expDiff -= 32;
1149
while ( 0 < expDiff ) {
1150
q = estimateDiv64To32( aSig, 0, bSig );
1151
q = ( 2 < q ) ? q - 2 : 0;
1152
aSig = - ( ( bSig>>2 ) * q );
1153
expDiff -= 30;
1154
}
1155
expDiff += 32;
1156
if ( 0 < expDiff ) {
1157
q = estimateDiv64To32( aSig, 0, bSig );
1158
q = ( 2 < q ) ? q - 2 : 0;
1159
q >>= 32 - expDiff;
1160
bSig >>= 2;
1161
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1162
}
1163
else {
1164
aSig >>= 2;
1165
bSig >>= 2;
1166
}
1167
do {
1168
alternateASig = aSig;
1169
++q;
1170
aSig -= bSig;
1171
} while ( 0 <= (sbits32) aSig );
1172
sigMean = aSig + alternateASig;
1173
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1174
aSig = alternateASig;
1175
}
1176
zSign = ( (sbits32) aSig < 0 );
1177
if ( zSign ) aSig = - aSig;
1178
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1179
1180
}
1181
#endif
1182
1183
#ifndef SOFTFLOAT_FOR_GCC
1184
/*
1185
-------------------------------------------------------------------------------
1186
Returns the square root of the single-precision floating-point value `a'.
1187
The operation is performed according to the IEC/IEEE Standard for Binary
1188
Floating-Point Arithmetic.
1189
-------------------------------------------------------------------------------
1190
*/
1191
float32 float32_sqrt( float32 a )
1192
{
1193
flag aSign;
1194
int16 aExp, zExp;
1195
bits32 aSig, zSig, rem0, rem1, term0, term1;
1196
1197
aSig = extractFloat32Frac( a );
1198
aExp = extractFloat32Exp( a );
1199
aSign = extractFloat32Sign( a );
1200
if ( aExp == 0xFF ) {
1201
if ( aSig ) return propagateFloat32NaN( a, 0 );
1202
if ( ! aSign ) return a;
1203
float_raise( float_flag_invalid );
1204
return float32_default_nan;
1205
}
1206
if ( aSign ) {
1207
if ( ( aExp | aSig ) == 0 ) return a;
1208
float_raise( float_flag_invalid );
1209
return float32_default_nan;
1210
}
1211
if ( aExp == 0 ) {
1212
if ( aSig == 0 ) return 0;
1213
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1214
}
1215
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1216
aSig = ( aSig | 0x00800000 )<<8;
1217
zSig = estimateSqrt32( aExp, aSig ) + 2;
1218
if ( ( zSig & 0x7F ) <= 5 ) {
1219
if ( zSig < 2 ) {
1220
zSig = 0x7FFFFFFF;
1221
goto roundAndPack;
1222
}
1223
else {
1224
aSig >>= aExp & 1;
1225
mul32To64( zSig, zSig, &term0, &term1 );
1226
sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1227
while ( (sbits32) rem0 < 0 ) {
1228
--zSig;
1229
shortShift64Left( 0, zSig, 1, &term0, &term1 );
1230
term1 |= 1;
1231
add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1232
}
1233
zSig |= ( ( rem0 | rem1 ) != 0 );
1234
}
1235
}
1236
shift32RightJamming( zSig, 1, &zSig );
1237
roundAndPack:
1238
return roundAndPackFloat32( 0, zExp, zSig );
1239
1240
}
1241
#endif
1242
1243
/*
1244
-------------------------------------------------------------------------------
1245
Returns 1 if the single-precision floating-point value `a' is equal to
1246
the corresponding value `b', and 0 otherwise. The comparison is performed
1247
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1248
-------------------------------------------------------------------------------
1249
*/
1250
flag float32_eq( float32 a, float32 b )
1251
{
1252
1253
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1254
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1255
) {
1256
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1257
float_raise( float_flag_invalid );
1258
}
1259
return 0;
1260
}
1261
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1262
1263
}
1264
1265
/*
1266
-------------------------------------------------------------------------------
1267
Returns 1 if the single-precision floating-point value `a' is less than
1268
or equal to the corresponding value `b', and 0 otherwise. The comparison
1269
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1270
Arithmetic.
1271
-------------------------------------------------------------------------------
1272
*/
1273
flag float32_le( float32 a, float32 b )
1274
{
1275
flag aSign, bSign;
1276
1277
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1278
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1279
) {
1280
float_raise( float_flag_invalid );
1281
return 0;
1282
}
1283
aSign = extractFloat32Sign( a );
1284
bSign = extractFloat32Sign( b );
1285
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1286
return ( a == b ) || ( aSign ^ ( a < b ) );
1287
1288
}
1289
1290
/*
1291
-------------------------------------------------------------------------------
1292
Returns 1 if the single-precision floating-point value `a' is less than
1293
the corresponding value `b', and 0 otherwise. The comparison is performed
1294
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1295
-------------------------------------------------------------------------------
1296
*/
1297
flag float32_lt( float32 a, float32 b )
1298
{
1299
flag aSign, bSign;
1300
1301
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1302
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1303
) {
1304
float_raise( float_flag_invalid );
1305
return 0;
1306
}
1307
aSign = extractFloat32Sign( a );
1308
bSign = extractFloat32Sign( b );
1309
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1310
return ( a != b ) && ( aSign ^ ( a < b ) );
1311
1312
}
1313
1314
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1315
/*
1316
-------------------------------------------------------------------------------
1317
Returns 1 if the single-precision floating-point value `a' is equal to
1318
the corresponding value `b', and 0 otherwise. The invalid exception is
1319
raised if either operand is a NaN. Otherwise, the comparison is performed
1320
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1321
-------------------------------------------------------------------------------
1322
*/
1323
flag float32_eq_signaling( float32 a, float32 b )
1324
{
1325
1326
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1327
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1328
) {
1329
float_raise( float_flag_invalid );
1330
return 0;
1331
}
1332
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1333
1334
}
1335
1336
/*
1337
-------------------------------------------------------------------------------
1338
Returns 1 if the single-precision floating-point value `a' is less than or
1339
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1340
cause an exception. Otherwise, the comparison is performed according to the
1341
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1342
-------------------------------------------------------------------------------
1343
*/
1344
flag float32_le_quiet( float32 a, float32 b )
1345
{
1346
flag aSign, bSign;
1347
int16 aExp, bExp;
1348
1349
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1350
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1351
) {
1352
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1353
float_raise( float_flag_invalid );
1354
}
1355
return 0;
1356
}
1357
aSign = extractFloat32Sign( a );
1358
bSign = extractFloat32Sign( b );
1359
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1360
return ( a == b ) || ( aSign ^ ( a < b ) );
1361
1362
}
1363
1364
/*
1365
-------------------------------------------------------------------------------
1366
Returns 1 if the single-precision floating-point value `a' is less than
1367
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1368
exception. Otherwise, the comparison is performed according to the IEC/IEEE
1369
Standard for Binary Floating-Point Arithmetic.
1370
-------------------------------------------------------------------------------
1371
*/
1372
flag float32_lt_quiet( float32 a, float32 b )
1373
{
1374
flag aSign, bSign;
1375
1376
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1377
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1378
) {
1379
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1380
float_raise( float_flag_invalid );
1381
}
1382
return 0;
1383
}
1384
aSign = extractFloat32Sign( a );
1385
bSign = extractFloat32Sign( b );
1386
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1387
return ( a != b ) && ( aSign ^ ( a < b ) );
1388
1389
}
1390
#endif /* !SOFTFLOAT_FOR_GCC */
1391
1392
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1393
/*
1394
-------------------------------------------------------------------------------
1395
Returns the result of converting the double-precision floating-point value
1396
`a' to the 32-bit two's complement integer format. The conversion is
1397
performed according to the IEC/IEEE Standard for Binary Floating-Point
1398
Arithmetic---which means in particular that the conversion is rounded
1399
according to the current rounding mode. If `a' is a NaN, the largest
1400
positive integer is returned. Otherwise, if the conversion overflows, the
1401
largest integer with the same sign as `a' is returned.
1402
-------------------------------------------------------------------------------
1403
*/
1404
int32 float64_to_int32( float64 a )
1405
{
1406
flag aSign;
1407
int16 aExp, shiftCount;
1408
bits32 aSig0, aSig1, absZ, aSigExtra;
1409
int32 z;
1410
int8 roundingMode;
1411
1412
aSig1 = extractFloat64Frac1( a );
1413
aSig0 = extractFloat64Frac0( a );
1414
aExp = extractFloat64Exp( a );
1415
aSign = extractFloat64Sign( a );
1416
shiftCount = aExp - 0x413;
1417
if ( 0 <= shiftCount ) {
1418
if ( 0x41E < aExp ) {
1419
if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1420
goto invalid;
1421
}
1422
shortShift64Left(
1423
aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1424
if ( 0x80000000 < absZ ) goto invalid;
1425
}
1426
else {
1427
aSig1 = ( aSig1 != 0 );
1428
if ( aExp < 0x3FE ) {
1429
aSigExtra = aExp | aSig0 | aSig1;
1430
absZ = 0;
1431
}
1432
else {
1433
aSig0 |= 0x00100000;
1434
aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1435
absZ = aSig0>>( - shiftCount );
1436
}
1437
}
1438
roundingMode = float_rounding_mode;
1439
if ( roundingMode == float_round_nearest_even ) {
1440
if ( (sbits32) aSigExtra < 0 ) {
1441
++absZ;
1442
if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1443
}
1444
z = aSign ? - absZ : absZ;
1445
}
1446
else {
1447
aSigExtra = ( aSigExtra != 0 );
1448
if ( aSign ) {
1449
z = - ( absZ
1450
+ ( ( roundingMode == float_round_down ) & aSigExtra ) );
1451
}
1452
else {
1453
z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1454
}
1455
}
1456
if ( ( aSign ^ ( z < 0 ) ) && z ) {
1457
invalid:
1458
float_raise( float_flag_invalid );
1459
return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1460
}
1461
if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1462
return z;
1463
1464
}
1465
#endif /* !SOFTFLOAT_FOR_GCC */
1466
1467
/*
1468
-------------------------------------------------------------------------------
1469
Returns the result of converting the double-precision floating-point value
1470
`a' to the 32-bit two's complement integer format. The conversion is
1471
performed according to the IEC/IEEE Standard for Binary Floating-Point
1472
Arithmetic, except that the conversion is always rounded toward zero.
1473
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1474
the conversion overflows, the largest integer with the same sign as `a' is
1475
returned.
1476
-------------------------------------------------------------------------------
1477
*/
1478
int32 float64_to_int32_round_to_zero( float64 a )
1479
{
1480
flag aSign;
1481
int16 aExp, shiftCount;
1482
bits32 aSig0, aSig1, absZ, aSigExtra;
1483
int32 z;
1484
1485
aSig1 = extractFloat64Frac1( a );
1486
aSig0 = extractFloat64Frac0( a );
1487
aExp = extractFloat64Exp( a );
1488
aSign = extractFloat64Sign( a );
1489
shiftCount = aExp - 0x413;
1490
if ( 0 <= shiftCount ) {
1491
if ( 0x41E < aExp ) {
1492
if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1493
goto invalid;
1494
}
1495
shortShift64Left(
1496
aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1497
}
1498
else {
1499
if ( aExp < 0x3FF ) {
1500
if ( aExp | aSig0 | aSig1 ) {
1501
float_exception_flags |= float_flag_inexact;
1502
}
1503
return 0;
1504
}
1505
aSig0 |= 0x00100000;
1506
aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1507
absZ = aSig0>>( - shiftCount );
1508
}
1509
z = aSign ? - absZ : absZ;
1510
if ( ( aSign ^ ( z < 0 ) ) && z ) {
1511
invalid:
1512
float_raise( float_flag_invalid );
1513
return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1514
}
1515
if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1516
return z;
1517
1518
}
1519
1520
/*
1521
-------------------------------------------------------------------------------
1522
Returns the result of converting the double-precision floating-point value
1523
`a' to the single-precision floating-point format. The conversion is
1524
performed according to the IEC/IEEE Standard for Binary Floating-Point
1525
Arithmetic.
1526
-------------------------------------------------------------------------------
1527
*/
1528
float32 float64_to_float32( float64 a )
1529
{
1530
flag aSign;
1531
int16 aExp;
1532
bits32 aSig0, aSig1, zSig;
1533
bits32 allZero;
1534
1535
aSig1 = extractFloat64Frac1( a );
1536
aSig0 = extractFloat64Frac0( a );
1537
aExp = extractFloat64Exp( a );
1538
aSign = extractFloat64Sign( a );
1539
if ( aExp == 0x7FF ) {
1540
if ( aSig0 | aSig1 ) {
1541
return commonNaNToFloat32( float64ToCommonNaN( a ) );
1542
}
1543
return packFloat32( aSign, 0xFF, 0 );
1544
}
1545
shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1546
if ( aExp ) zSig |= 0x40000000;
1547
return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1548
1549
}
1550
1551
#ifndef SOFTFLOAT_FOR_GCC
1552
/*
1553
-------------------------------------------------------------------------------
1554
Rounds the double-precision floating-point value `a' to an integer,
1555
and returns the result as a double-precision floating-point value. The
1556
operation is performed according to the IEC/IEEE Standard for Binary
1557
Floating-Point Arithmetic.
1558
-------------------------------------------------------------------------------
1559
*/
1560
float64 float64_round_to_int( float64 a )
1561
{
1562
flag aSign;
1563
int16 aExp;
1564
bits32 lastBitMask, roundBitsMask;
1565
int8 roundingMode;
1566
float64 z;
1567
1568
aExp = extractFloat64Exp( a );
1569
if ( 0x413 <= aExp ) {
1570
if ( 0x433 <= aExp ) {
1571
if ( ( aExp == 0x7FF )
1572
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1573
return propagateFloat64NaN( a, a );
1574
}
1575
return a;
1576
}
1577
lastBitMask = 1;
1578
lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1579
roundBitsMask = lastBitMask - 1;
1580
z = a;
1581
roundingMode = float_rounding_mode;
1582
if ( roundingMode == float_round_nearest_even ) {
1583
if ( lastBitMask ) {
1584
add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1585
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1586
}
1587
else {
1588
if ( (sbits32) z.low < 0 ) {
1589
++z.high;
1590
if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1591
}
1592
}
1593
}
1594
else if ( roundingMode != float_round_to_zero ) {
1595
if ( extractFloat64Sign( z )
1596
^ ( roundingMode == float_round_up ) ) {
1597
add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1598
}
1599
}
1600
z.low &= ~ roundBitsMask;
1601
}
1602
else {
1603
if ( aExp <= 0x3FE ) {
1604
if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1605
float_exception_flags |= float_flag_inexact;
1606
aSign = extractFloat64Sign( a );
1607
switch ( float_rounding_mode ) {
1608
case float_round_nearest_even:
1609
if ( ( aExp == 0x3FE )
1610
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1611
) {
1612
return packFloat64( aSign, 0x3FF, 0, 0 );
1613
}
1614
break;
1615
case float_round_down:
1616
return
1617
aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1618
: packFloat64( 0, 0, 0, 0 );
1619
case float_round_up:
1620
return
1621
aSign ? packFloat64( 1, 0, 0, 0 )
1622
: packFloat64( 0, 0x3FF, 0, 0 );
1623
}
1624
return packFloat64( aSign, 0, 0, 0 );
1625
}
1626
lastBitMask = 1;
1627
lastBitMask <<= 0x413 - aExp;
1628
roundBitsMask = lastBitMask - 1;
1629
z.low = 0;
1630
z.high = a.high;
1631
roundingMode = float_rounding_mode;
1632
if ( roundingMode == float_round_nearest_even ) {
1633
z.high += lastBitMask>>1;
1634
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1635
z.high &= ~ lastBitMask;
1636
}
1637
}
1638
else if ( roundingMode != float_round_to_zero ) {
1639
if ( extractFloat64Sign( z )
1640
^ ( roundingMode == float_round_up ) ) {
1641
z.high |= ( a.low != 0 );
1642
z.high += roundBitsMask;
1643
}
1644
}
1645
z.high &= ~ roundBitsMask;
1646
}
1647
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1648
float_exception_flags |= float_flag_inexact;
1649
}
1650
return z;
1651
1652
}
1653
#endif
1654
1655
/*
1656
-------------------------------------------------------------------------------
1657
Returns the result of adding the absolute values of the double-precision
1658
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1659
before being returned. `zSign' is ignored if the result is a NaN.
1660
The addition is performed according to the IEC/IEEE Standard for Binary
1661
Floating-Point Arithmetic.
1662
-------------------------------------------------------------------------------
1663
*/
1664
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1665
{
1666
int16 aExp, bExp, zExp;
1667
bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1668
int16 expDiff;
1669
1670
aSig1 = extractFloat64Frac1( a );
1671
aSig0 = extractFloat64Frac0( a );
1672
aExp = extractFloat64Exp( a );
1673
bSig1 = extractFloat64Frac1( b );
1674
bSig0 = extractFloat64Frac0( b );
1675
bExp = extractFloat64Exp( b );
1676
expDiff = aExp - bExp;
1677
if ( 0 < expDiff ) {
1678
if ( aExp == 0x7FF ) {
1679
if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1680
return a;
1681
}
1682
if ( bExp == 0 ) {
1683
--expDiff;
1684
}
1685
else {
1686
bSig0 |= 0x00100000;
1687
}
1688
shift64ExtraRightJamming(
1689
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1690
zExp = aExp;
1691
}
1692
else if ( expDiff < 0 ) {
1693
if ( bExp == 0x7FF ) {
1694
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1695
return packFloat64( zSign, 0x7FF, 0, 0 );
1696
}
1697
if ( aExp == 0 ) {
1698
++expDiff;
1699
}
1700
else {
1701
aSig0 |= 0x00100000;
1702
}
1703
shift64ExtraRightJamming(
1704
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1705
zExp = bExp;
1706
}
1707
else {
1708
if ( aExp == 0x7FF ) {
1709
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1710
return propagateFloat64NaN( a, b );
1711
}
1712
return a;
1713
}
1714
add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1715
if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1716
zSig2 = 0;
1717
zSig0 |= 0x00200000;
1718
zExp = aExp;
1719
goto shiftRight1;
1720
}
1721
aSig0 |= 0x00100000;
1722
add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1723
--zExp;
1724
if ( zSig0 < 0x00200000 ) goto roundAndPack;
1725
++zExp;
1726
shiftRight1:
1727
shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1728
roundAndPack:
1729
return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1730
1731
}
1732
1733
/*
1734
-------------------------------------------------------------------------------
1735
Returns the result of subtracting the absolute values of the double-
1736
precision floating-point values `a' and `b'. If `zSign' is 1, the
1737
difference is negated before being returned. `zSign' is ignored if the
1738
result is a NaN. The subtraction is performed according to the IEC/IEEE
1739
Standard for Binary Floating-Point Arithmetic.
1740
-------------------------------------------------------------------------------
1741
*/
1742
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1743
{
1744
int16 aExp, bExp, zExp;
1745
bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1746
int16 expDiff;
1747
1748
aSig1 = extractFloat64Frac1( a );
1749
aSig0 = extractFloat64Frac0( a );
1750
aExp = extractFloat64Exp( a );
1751
bSig1 = extractFloat64Frac1( b );
1752
bSig0 = extractFloat64Frac0( b );
1753
bExp = extractFloat64Exp( b );
1754
expDiff = aExp - bExp;
1755
shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1756
shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1757
if ( 0 < expDiff ) goto aExpBigger;
1758
if ( expDiff < 0 ) goto bExpBigger;
1759
if ( aExp == 0x7FF ) {
1760
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1761
return propagateFloat64NaN( a, b );
1762
}
1763
float_raise( float_flag_invalid );
1764
return float64_default_nan;
1765
}
1766
if ( aExp == 0 ) {
1767
aExp = 1;
1768
bExp = 1;
1769
}
1770
if ( bSig0 < aSig0 ) goto aBigger;
1771
if ( aSig0 < bSig0 ) goto bBigger;
1772
if ( bSig1 < aSig1 ) goto aBigger;
1773
if ( aSig1 < bSig1 ) goto bBigger;
1774
return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1775
bExpBigger:
1776
if ( bExp == 0x7FF ) {
1777
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1778
return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1779
}
1780
if ( aExp == 0 ) {
1781
++expDiff;
1782
}
1783
else {
1784
aSig0 |= 0x40000000;
1785
}
1786
shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1787
bSig0 |= 0x40000000;
1788
bBigger:
1789
sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1790
zExp = bExp;
1791
zSign ^= 1;
1792
goto normalizeRoundAndPack;
1793
aExpBigger:
1794
if ( aExp == 0x7FF ) {
1795
if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1796
return a;
1797
}
1798
if ( bExp == 0 ) {
1799
--expDiff;
1800
}
1801
else {
1802
bSig0 |= 0x40000000;
1803
}
1804
shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1805
aSig0 |= 0x40000000;
1806
aBigger:
1807
sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1808
zExp = aExp;
1809
normalizeRoundAndPack:
1810
--zExp;
1811
return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1812
1813
}
1814
1815
/*
1816
-------------------------------------------------------------------------------
1817
Returns the result of adding the double-precision floating-point values `a'
1818
and `b'. The operation is performed according to the IEC/IEEE Standard for
1819
Binary Floating-Point Arithmetic.
1820
-------------------------------------------------------------------------------
1821
*/
1822
float64 float64_add( float64 a, float64 b )
1823
{
1824
flag aSign, bSign;
1825
1826
aSign = extractFloat64Sign( a );
1827
bSign = extractFloat64Sign( b );
1828
if ( aSign == bSign ) {
1829
return addFloat64Sigs( a, b, aSign );
1830
}
1831
else {
1832
return subFloat64Sigs( a, b, aSign );
1833
}
1834
1835
}
1836
1837
/*
1838
-------------------------------------------------------------------------------
1839
Returns the result of subtracting the double-precision floating-point values
1840
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1841
for Binary Floating-Point Arithmetic.
1842
-------------------------------------------------------------------------------
1843
*/
1844
float64 float64_sub( float64 a, float64 b )
1845
{
1846
flag aSign, bSign;
1847
1848
aSign = extractFloat64Sign( a );
1849
bSign = extractFloat64Sign( b );
1850
if ( aSign == bSign ) {
1851
return subFloat64Sigs( a, b, aSign );
1852
}
1853
else {
1854
return addFloat64Sigs( a, b, aSign );
1855
}
1856
1857
}
1858
1859
/*
1860
-------------------------------------------------------------------------------
1861
Returns the result of multiplying the double-precision floating-point values
1862
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1863
for Binary Floating-Point Arithmetic.
1864
-------------------------------------------------------------------------------
1865
*/
1866
float64 float64_mul( float64 a, float64 b )
1867
{
1868
flag aSign, bSign, zSign;
1869
int16 aExp, bExp, zExp;
1870
bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1871
1872
aSig1 = extractFloat64Frac1( a );
1873
aSig0 = extractFloat64Frac0( a );
1874
aExp = extractFloat64Exp( a );
1875
aSign = extractFloat64Sign( a );
1876
bSig1 = extractFloat64Frac1( b );
1877
bSig0 = extractFloat64Frac0( b );
1878
bExp = extractFloat64Exp( b );
1879
bSign = extractFloat64Sign( b );
1880
zSign = aSign ^ bSign;
1881
if ( aExp == 0x7FF ) {
1882
if ( ( aSig0 | aSig1 )
1883
|| ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1884
return propagateFloat64NaN( a, b );
1885
}
1886
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1887
return packFloat64( zSign, 0x7FF, 0, 0 );
1888
}
1889
if ( bExp == 0x7FF ) {
1890
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1891
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1892
invalid:
1893
float_raise( float_flag_invalid );
1894
return float64_default_nan;
1895
}
1896
return packFloat64( zSign, 0x7FF, 0, 0 );
1897
}
1898
if ( aExp == 0 ) {
1899
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1900
normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1901
}
1902
if ( bExp == 0 ) {
1903
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1904
normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1905
}
1906
zExp = aExp + bExp - 0x400;
1907
aSig0 |= 0x00100000;
1908
shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1909
mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1910
add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1911
zSig2 |= ( zSig3 != 0 );
1912
if ( 0x00200000 <= zSig0 ) {
1913
shift64ExtraRightJamming(
1914
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1915
++zExp;
1916
}
1917
return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1918
1919
}
1920
1921
/*
1922
-------------------------------------------------------------------------------
1923
Returns the result of dividing the double-precision floating-point value `a'
1924
by the corresponding value `b'. The operation is performed according to the
1925
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1926
-------------------------------------------------------------------------------
1927
*/
1928
float64 float64_div( float64 a, float64 b )
1929
{
1930
flag aSign, bSign, zSign;
1931
int16 aExp, bExp, zExp;
1932
bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1933
bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1934
1935
aSig1 = extractFloat64Frac1( a );
1936
aSig0 = extractFloat64Frac0( a );
1937
aExp = extractFloat64Exp( a );
1938
aSign = extractFloat64Sign( a );
1939
bSig1 = extractFloat64Frac1( b );
1940
bSig0 = extractFloat64Frac0( b );
1941
bExp = extractFloat64Exp( b );
1942
bSign = extractFloat64Sign( b );
1943
zSign = aSign ^ bSign;
1944
if ( aExp == 0x7FF ) {
1945
if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1946
if ( bExp == 0x7FF ) {
1947
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1948
goto invalid;
1949
}
1950
return packFloat64( zSign, 0x7FF, 0, 0 );
1951
}
1952
if ( bExp == 0x7FF ) {
1953
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1954
return packFloat64( zSign, 0, 0, 0 );
1955
}
1956
if ( bExp == 0 ) {
1957
if ( ( bSig0 | bSig1 ) == 0 ) {
1958
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1959
invalid:
1960
float_raise( float_flag_invalid );
1961
return float64_default_nan;
1962
}
1963
float_raise( float_flag_divbyzero );
1964
return packFloat64( zSign, 0x7FF, 0, 0 );
1965
}
1966
normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1967
}
1968
if ( aExp == 0 ) {
1969
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1970
normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1971
}
1972
zExp = aExp - bExp + 0x3FD;
1973
shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1974
shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1975
if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1976
shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1977
++zExp;
1978
}
1979
zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1980
mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1981
sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1982
while ( (sbits32) rem0 < 0 ) {
1983
--zSig0;
1984
add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1985
}
1986
zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1987
if ( ( zSig1 & 0x3FF ) <= 4 ) {
1988
mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1989
sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1990
while ( (sbits32) rem1 < 0 ) {
1991
--zSig1;
1992
add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1993
}
1994
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1995
}
1996
shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
1997
return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1998
1999
}
2000
2001
#ifndef SOFTFLOAT_FOR_GCC
2002
/*
2003
-------------------------------------------------------------------------------
2004
Returns the remainder of the double-precision floating-point value `a'
2005
with respect to the corresponding value `b'. The operation is performed
2006
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2007
-------------------------------------------------------------------------------
2008
*/
2009
float64 float64_rem( float64 a, float64 b )
2010
{
2011
flag aSign, bSign, zSign;
2012
int16 aExp, bExp, expDiff;
2013
bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2014
bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2015
sbits32 sigMean0;
2016
float64 z;
2017
2018
aSig1 = extractFloat64Frac1( a );
2019
aSig0 = extractFloat64Frac0( a );
2020
aExp = extractFloat64Exp( a );
2021
aSign = extractFloat64Sign( a );
2022
bSig1 = extractFloat64Frac1( b );
2023
bSig0 = extractFloat64Frac0( b );
2024
bExp = extractFloat64Exp( b );
2025
bSign = extractFloat64Sign( b );
2026
if ( aExp == 0x7FF ) {
2027
if ( ( aSig0 | aSig1 )
2028
|| ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2029
return propagateFloat64NaN( a, b );
2030
}
2031
goto invalid;
2032
}
2033
if ( bExp == 0x7FF ) {
2034
if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2035
return a;
2036
}
2037
if ( bExp == 0 ) {
2038
if ( ( bSig0 | bSig1 ) == 0 ) {
2039
invalid:
2040
float_raise( float_flag_invalid );
2041
return float64_default_nan;
2042
}
2043
normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2044
}
2045
if ( aExp == 0 ) {
2046
if ( ( aSig0 | aSig1 ) == 0 ) return a;
2047
normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2048
}
2049
expDiff = aExp - bExp;
2050
if ( expDiff < -1 ) return a;
2051
shortShift64Left(
2052
aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2053
shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2054
q = le64( bSig0, bSig1, aSig0, aSig1 );
2055
if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2056
expDiff -= 32;
2057
while ( 0 < expDiff ) {
2058
q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2059
q = ( 4 < q ) ? q - 4 : 0;
2060
mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2061
shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2062
shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2063
sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2064
expDiff -= 29;
2065
}
2066
if ( -32 < expDiff ) {
2067
q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2068
q = ( 4 < q ) ? q - 4 : 0;
2069
q >>= - expDiff;
2070
shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2071
expDiff += 24;
2072
if ( expDiff < 0 ) {
2073
shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2074
}
2075
else {
2076
shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2077
}
2078
mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2079
sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2080
}
2081
else {
2082
shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2083
shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2084
}
2085
do {
2086
alternateASig0 = aSig0;
2087
alternateASig1 = aSig1;
2088
++q;
2089
sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2090
} while ( 0 <= (sbits32) aSig0 );
2091
add64(
2092
aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2093
if ( ( sigMean0 < 0 )
2094
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2095
aSig0 = alternateASig0;
2096
aSig1 = alternateASig1;
2097
}
2098
zSign = ( (sbits32) aSig0 < 0 );
2099
if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2100
return
2101
normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2102
2103
}
2104
#endif
2105
2106
#ifndef SOFTFLOAT_FOR_GCC
2107
/*
2108
-------------------------------------------------------------------------------
2109
Returns the square root of the double-precision floating-point value `a'.
2110
The operation is performed according to the IEC/IEEE Standard for Binary
2111
Floating-Point Arithmetic.
2112
-------------------------------------------------------------------------------
2113
*/
2114
float64 float64_sqrt( float64 a )
2115
{
2116
flag aSign;
2117
int16 aExp, zExp;
2118
bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2119
bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2120
float64 z;
2121
2122
aSig1 = extractFloat64Frac1( a );
2123
aSig0 = extractFloat64Frac0( a );
2124
aExp = extractFloat64Exp( a );
2125
aSign = extractFloat64Sign( a );
2126
if ( aExp == 0x7FF ) {
2127
if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2128
if ( ! aSign ) return a;
2129
goto invalid;
2130
}
2131
if ( aSign ) {
2132
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2133
invalid:
2134
float_raise( float_flag_invalid );
2135
return float64_default_nan;
2136
}
2137
if ( aExp == 0 ) {
2138
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2139
normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2140
}
2141
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2142
aSig0 |= 0x00100000;
2143
shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2144
zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2145
if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2146
doubleZSig0 = zSig0 + zSig0;
2147
shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2148
mul32To64( zSig0, zSig0, &term0, &term1 );
2149
sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2150
while ( (sbits32) rem0 < 0 ) {
2151
--zSig0;
2152
doubleZSig0 -= 2;
2153
add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2154
}
2155
zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2156
if ( ( zSig1 & 0x1FF ) <= 5 ) {
2157
if ( zSig1 == 0 ) zSig1 = 1;
2158
mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2159
sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2160
mul32To64( zSig1, zSig1, &term2, &term3 );
2161
sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2162
while ( (sbits32) rem1 < 0 ) {
2163
--zSig1;
2164
shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2165
term3 |= 1;
2166
term2 |= doubleZSig0;
2167
add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2168
}
2169
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2170
}
2171
shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2172
return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2173
2174
}
2175
#endif
2176
2177
/*
2178
-------------------------------------------------------------------------------
2179
Returns 1 if the double-precision floating-point value `a' is equal to
2180
the corresponding value `b', and 0 otherwise. The comparison is performed
2181
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182
-------------------------------------------------------------------------------
2183
*/
2184
flag float64_eq( float64 a, float64 b )
2185
{
2186
2187
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2188
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2189
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2190
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2191
) {
2192
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2193
float_raise( float_flag_invalid );
2194
}
2195
return 0;
2196
}
2197
return ( a == b ) ||
2198
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2199
2200
}
2201
2202
/*
2203
-------------------------------------------------------------------------------
2204
Returns 1 if the double-precision floating-point value `a' is less than
2205
or equal to the corresponding value `b', and 0 otherwise. The comparison
2206
is performed according to the IEC/IEEE Standard for Binary Floating-Point
2207
Arithmetic.
2208
-------------------------------------------------------------------------------
2209
*/
2210
flag float64_le( float64 a, float64 b )
2211
{
2212
flag aSign, bSign;
2213
2214
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2215
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2216
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2217
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2218
) {
2219
float_raise( float_flag_invalid );
2220
return 0;
2221
}
2222
aSign = extractFloat64Sign( a );
2223
bSign = extractFloat64Sign( b );
2224
if ( aSign != bSign )
2225
return aSign ||
2226
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2227
0 );
2228
return ( a == b ) ||
2229
( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2230
}
2231
2232
/*
2233
-------------------------------------------------------------------------------
2234
Returns 1 if the double-precision floating-point value `a' is less than
2235
the corresponding value `b', and 0 otherwise. The comparison is performed
2236
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237
-------------------------------------------------------------------------------
2238
*/
2239
flag float64_lt( float64 a, float64 b )
2240
{
2241
flag aSign, bSign;
2242
2243
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2244
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2245
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2246
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2247
) {
2248
float_raise( float_flag_invalid );
2249
return 0;
2250
}
2251
aSign = extractFloat64Sign( a );
2252
bSign = extractFloat64Sign( b );
2253
if ( aSign != bSign )
2254
return aSign &&
2255
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2256
0 );
2257
return ( a != b ) &&
2258
( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2259
2260
}
2261
2262
#ifndef SOFTFLOAT_FOR_GCC
2263
/*
2264
-------------------------------------------------------------------------------
2265
Returns 1 if the double-precision floating-point value `a' is equal to
2266
the corresponding value `b', and 0 otherwise. The invalid exception is
2267
raised if either operand is a NaN. Otherwise, the comparison is performed
2268
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269
-------------------------------------------------------------------------------
2270
*/
2271
flag float64_eq_signaling( float64 a, float64 b )
2272
{
2273
2274
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2275
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2276
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2277
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2278
) {
2279
float_raise( float_flag_invalid );
2280
return 0;
2281
}
2282
return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2283
2284
}
2285
2286
/*
2287
-------------------------------------------------------------------------------
2288
Returns 1 if the double-precision floating-point value `a' is less than or
2289
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2290
cause an exception. Otherwise, the comparison is performed according to the
2291
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2292
-------------------------------------------------------------------------------
2293
*/
2294
flag float64_le_quiet( float64 a, float64 b )
2295
{
2296
flag aSign, bSign;
2297
2298
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2299
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2300
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2301
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2302
) {
2303
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2304
float_raise( float_flag_invalid );
2305
}
2306
return 0;
2307
}
2308
aSign = extractFloat64Sign( a );
2309
bSign = extractFloat64Sign( b );
2310
if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2311
return ( a == b ) || ( aSign ^ ( a < b ) );
2312
2313
}
2314
2315
/*
2316
-------------------------------------------------------------------------------
2317
Returns 1 if the double-precision floating-point value `a' is less than
2318
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2319
exception. Otherwise, the comparison is performed according to the IEC/IEEE
2320
Standard for Binary Floating-Point Arithmetic.
2321
-------------------------------------------------------------------------------
2322
*/
2323
flag float64_lt_quiet( float64 a, float64 b )
2324
{
2325
flag aSign, bSign;
2326
2327
if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2328
&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2329
|| ( ( extractFloat64Exp( b ) == 0x7FF )
2330
&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2331
) {
2332
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2333
float_raise( float_flag_invalid );
2334
}
2335
return 0;
2336
}
2337
aSign = extractFloat64Sign( a );
2338
bSign = extractFloat64Sign( b );
2339
if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2340
return ( a != b ) && ( aSign ^ ( a < b ) );
2341
2342
}
2343
2344
#endif
2345
2346