Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/softfloat/bits64/softfloat.c
39530 views
1
/* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2
3
/*
4
* This version hacked for use with gcc -msoft-float by bjh21.
5
* (Mostly a case of #ifdefing out things GCC doesn't need or provides
6
* itself).
7
*/
8
9
/*
10
* Things you may want to define:
11
*
12
* SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13
* -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14
* properly renamed.
15
*/
16
17
/*
18
===============================================================================
19
20
This C source file is part of the SoftFloat IEC/IEEE Floating-point
21
Arithmetic Package, Release 2a.
22
23
Written by John R. Hauser. This work was made possible in part by the
24
International Computer Science Institute, located at Suite 600, 1947 Center
25
Street, Berkeley, California 94704. Funding was partially provided by the
26
National Science Foundation under grant MIP-9311980. The original version
27
of this code was written as part of a project to build a fixed-point vector
28
processor in collaboration with the University of California at Berkeley,
29
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31
arithmetic/SoftFloat.html'.
32
33
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39
Derivative works are acceptable, even for commercial purposes, so long as
40
(1) they include prominent notice that the work is derivative, and (2) they
41
include prominent notice akin to these four paragraphs for those parts of
42
this code that are retained.
43
44
===============================================================================
45
*/
46
47
#ifdef SOFTFLOAT_FOR_GCC
48
#include "softfloat-for-gcc.h"
49
#endif
50
51
#include "milieu.h"
52
#include "softfloat.h"
53
54
/*
55
* Conversions between floats as stored in memory and floats as
56
* SoftFloat uses them
57
*/
58
#ifndef FLOAT64_DEMANGLE
59
#define FLOAT64_DEMANGLE(a) (a)
60
#endif
61
#ifndef FLOAT64_MANGLE
62
#define FLOAT64_MANGLE(a) (a)
63
#endif
64
65
/*
66
-------------------------------------------------------------------------------
67
Floating-point rounding mode, extended double-precision rounding precision,
68
and exception flags.
69
-------------------------------------------------------------------------------
70
*/
71
int float_rounding_mode = float_round_nearest_even;
72
int float_exception_flags = 0;
73
#ifdef FLOATX80
74
int8 floatx80_rounding_precision = 80;
75
#endif
76
77
/*
78
-------------------------------------------------------------------------------
79
Primitive arithmetic functions, including multi-word arithmetic, and
80
division and square root approximations. (Can be specialized to target if
81
desired.)
82
-------------------------------------------------------------------------------
83
*/
84
#include "softfloat-macros"
85
86
/*
87
-------------------------------------------------------------------------------
88
Functions and definitions to determine: (1) whether tininess for underflow
89
is detected before or after rounding by default, (2) what (if anything)
90
happens when exceptions are raised, (3) how signaling NaNs are distinguished
91
from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92
are propagated from function inputs to output. These details are target-
93
specific.
94
-------------------------------------------------------------------------------
95
*/
96
#include "softfloat-specialize"
97
98
#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
99
/*
100
-------------------------------------------------------------------------------
101
Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102
and 7, and returns the properly rounded 32-bit integer corresponding to the
103
input. If `zSign' is 1, the input is negated before being converted to an
104
integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
105
is simply rounded to an integer, with the inexact exception raised if the
106
input cannot be represented exactly as an integer. However, if the fixed-
107
point input is too large, the invalid exception is raised and the largest
108
positive or negative integer is returned.
109
-------------------------------------------------------------------------------
110
*/
111
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
112
{
113
int8 roundingMode;
114
flag roundNearestEven;
115
int8 roundIncrement, roundBits;
116
int32 z;
117
118
roundingMode = float_rounding_mode;
119
roundNearestEven = ( roundingMode == float_round_nearest_even );
120
roundIncrement = 0x40;
121
if ( ! roundNearestEven ) {
122
if ( roundingMode == float_round_to_zero ) {
123
roundIncrement = 0;
124
}
125
else {
126
roundIncrement = 0x7F;
127
if ( zSign ) {
128
if ( roundingMode == float_round_up ) roundIncrement = 0;
129
}
130
else {
131
if ( roundingMode == float_round_down ) roundIncrement = 0;
132
}
133
}
134
}
135
roundBits = absZ & 0x7F;
136
absZ = ( absZ + roundIncrement )>>7;
137
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
138
z = absZ;
139
if ( zSign ) z = - z;
140
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
141
float_raise( float_flag_invalid );
142
return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
143
}
144
if ( roundBits ) float_exception_flags |= float_flag_inexact;
145
return z;
146
147
}
148
149
/*
150
-------------------------------------------------------------------------------
151
Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152
`absZ1', with binary point between bits 63 and 64 (between the input words),
153
and returns the properly rounded 64-bit integer corresponding to the input.
154
If `zSign' is 1, the input is negated before being converted to an integer.
155
Ordinarily, the fixed-point input is simply rounded to an integer, with
156
the inexact exception raised if the input cannot be represented exactly as
157
an integer. However, if the fixed-point input is too large, the invalid
158
exception is raised and the largest positive or negative integer is
159
returned.
160
-------------------------------------------------------------------------------
161
*/
162
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
163
{
164
int8 roundingMode;
165
flag roundNearestEven, increment;
166
int64 z;
167
168
roundingMode = float_rounding_mode;
169
roundNearestEven = ( roundingMode == float_round_nearest_even );
170
increment = ( (sbits64) absZ1 < 0 );
171
if ( ! roundNearestEven ) {
172
if ( roundingMode == float_round_to_zero ) {
173
increment = 0;
174
}
175
else {
176
if ( zSign ) {
177
increment = ( roundingMode == float_round_down ) && absZ1;
178
}
179
else {
180
increment = ( roundingMode == float_round_up ) && absZ1;
181
}
182
}
183
}
184
if ( increment ) {
185
++absZ0;
186
if ( absZ0 == 0 ) goto overflow;
187
absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188
}
189
z = absZ0;
190
if ( zSign ) z = - z;
191
if ( z && ( ( z < 0 ) ^ zSign ) ) {
192
overflow:
193
float_raise( float_flag_invalid );
194
return
195
zSign ? (sbits64) LIT64( 0x8000000000000000 )
196
: LIT64( 0x7FFFFFFFFFFFFFFF );
197
}
198
if ( absZ1 ) float_exception_flags |= float_flag_inexact;
199
return z;
200
201
}
202
#endif
203
204
/*
205
-------------------------------------------------------------------------------
206
Returns the fraction bits of the single-precision floating-point value `a'.
207
-------------------------------------------------------------------------------
208
*/
209
INLINE bits32 extractFloat32Frac( float32 a )
210
{
211
212
return a & 0x007FFFFF;
213
214
}
215
216
/*
217
-------------------------------------------------------------------------------
218
Returns the exponent bits of the single-precision floating-point value `a'.
219
-------------------------------------------------------------------------------
220
*/
221
INLINE int16 extractFloat32Exp( float32 a )
222
{
223
224
return ( a>>23 ) & 0xFF;
225
226
}
227
228
/*
229
-------------------------------------------------------------------------------
230
Returns the sign bit of the single-precision floating-point value `a'.
231
-------------------------------------------------------------------------------
232
*/
233
INLINE flag extractFloat32Sign( float32 a )
234
{
235
236
return a>>31;
237
238
}
239
240
/*
241
-------------------------------------------------------------------------------
242
Normalizes the subnormal single-precision floating-point value represented
243
by the denormalized significand `aSig'. The normalized exponent and
244
significand are stored at the locations pointed to by `zExpPtr' and
245
`zSigPtr', respectively.
246
-------------------------------------------------------------------------------
247
*/
248
static void
249
normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
250
{
251
int8 shiftCount;
252
253
shiftCount = countLeadingZeros32( aSig ) - 8;
254
*zSigPtr = aSig<<shiftCount;
255
*zExpPtr = 1 - shiftCount;
256
257
}
258
259
/*
260
-------------------------------------------------------------------------------
261
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262
single-precision floating-point value, returning the result. After being
263
shifted into the proper positions, the three fields are simply added
264
together to form the result. This means that any integer portion of `zSig'
265
will be added into the exponent. Since a properly normalized significand
266
will have an integer portion equal to 1, the `zExp' input should be 1 less
267
than the desired result exponent whenever `zSig' is a complete, normalized
268
significand.
269
-------------------------------------------------------------------------------
270
*/
271
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
272
{
273
274
return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
275
276
}
277
278
/*
279
-------------------------------------------------------------------------------
280
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281
and significand `zSig', and returns the proper single-precision floating-
282
point value corresponding to the abstract input. Ordinarily, the abstract
283
value is simply rounded and packed into the single-precision format, with
284
the inexact exception raised if the abstract input cannot be represented
285
exactly. However, if the abstract value is too large, the overflow and
286
inexact exceptions are raised and an infinity or maximal finite value is
287
returned. If the abstract value is too small, the input value is rounded to
288
a subnormal number, and the underflow and inexact exceptions are raised if
289
the abstract input cannot be represented exactly as a subnormal single-
290
precision floating-point number.
291
The input significand `zSig' has its binary point between bits 30
292
and 29, which is 7 bits to the left of the usual location. This shifted
293
significand must be normalized or smaller. If `zSig' is not normalized,
294
`zExp' must be 0; in that case, the result returned is a subnormal number,
295
and it must not require rounding. In the usual case that `zSig' is
296
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297
The handling of underflow and overflow follows the IEC/IEEE Standard for
298
Binary Floating-Point Arithmetic.
299
-------------------------------------------------------------------------------
300
*/
301
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
302
{
303
int8 roundingMode;
304
flag roundNearestEven;
305
int8 roundIncrement, roundBits;
306
flag isTiny;
307
308
roundingMode = float_rounding_mode;
309
roundNearestEven = ( roundingMode == float_round_nearest_even );
310
roundIncrement = 0x40;
311
if ( ! roundNearestEven ) {
312
if ( roundingMode == float_round_to_zero ) {
313
roundIncrement = 0;
314
}
315
else {
316
roundIncrement = 0x7F;
317
if ( zSign ) {
318
if ( roundingMode == float_round_up ) roundIncrement = 0;
319
}
320
else {
321
if ( roundingMode == float_round_down ) roundIncrement = 0;
322
}
323
}
324
}
325
roundBits = zSig & 0x7F;
326
if ( 0xFD <= (bits16) zExp ) {
327
if ( ( 0xFD < zExp )
328
|| ( ( zExp == 0xFD )
329
&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
330
) {
331
float_raise( float_flag_overflow | float_flag_inexact );
332
return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
333
}
334
if ( zExp < 0 ) {
335
isTiny =
336
( float_detect_tininess == float_tininess_before_rounding )
337
|| ( zExp < -1 )
338
|| ( zSig + roundIncrement < 0x80000000 );
339
shift32RightJamming( zSig, - zExp, &zSig );
340
zExp = 0;
341
roundBits = zSig & 0x7F;
342
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
343
}
344
}
345
if ( roundBits ) float_exception_flags |= float_flag_inexact;
346
zSig = ( zSig + roundIncrement )>>7;
347
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
348
if ( zSig == 0 ) zExp = 0;
349
return packFloat32( zSign, zExp, zSig );
350
351
}
352
353
/*
354
-------------------------------------------------------------------------------
355
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356
and significand `zSig', and returns the proper single-precision floating-
357
point value corresponding to the abstract input. This routine is just like
358
`roundAndPackFloat32' except that `zSig' does not have to be normalized.
359
Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360
floating-point exponent.
361
-------------------------------------------------------------------------------
362
*/
363
static float32
364
normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
365
{
366
int8 shiftCount;
367
368
shiftCount = countLeadingZeros32( zSig ) - 1;
369
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
370
371
}
372
373
/*
374
-------------------------------------------------------------------------------
375
Returns the fraction bits of the double-precision floating-point value `a'.
376
-------------------------------------------------------------------------------
377
*/
378
INLINE bits64 extractFloat64Frac( float64 a )
379
{
380
381
return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
382
383
}
384
385
/*
386
-------------------------------------------------------------------------------
387
Returns the exponent bits of the double-precision floating-point value `a'.
388
-------------------------------------------------------------------------------
389
*/
390
INLINE int16 extractFloat64Exp( float64 a )
391
{
392
393
return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
394
395
}
396
397
/*
398
-------------------------------------------------------------------------------
399
Returns the sign bit of the double-precision floating-point value `a'.
400
-------------------------------------------------------------------------------
401
*/
402
INLINE flag extractFloat64Sign( float64 a )
403
{
404
405
return FLOAT64_DEMANGLE(a)>>63;
406
407
}
408
409
/*
410
-------------------------------------------------------------------------------
411
Normalizes the subnormal double-precision floating-point value represented
412
by the denormalized significand `aSig'. The normalized exponent and
413
significand are stored at the locations pointed to by `zExpPtr' and
414
`zSigPtr', respectively.
415
-------------------------------------------------------------------------------
416
*/
417
static void
418
normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
419
{
420
int8 shiftCount;
421
422
shiftCount = countLeadingZeros64( aSig ) - 11;
423
*zSigPtr = aSig<<shiftCount;
424
*zExpPtr = 1 - shiftCount;
425
426
}
427
428
/*
429
-------------------------------------------------------------------------------
430
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431
double-precision floating-point value, returning the result. After being
432
shifted into the proper positions, the three fields are simply added
433
together to form the result. This means that any integer portion of `zSig'
434
will be added into the exponent. Since a properly normalized significand
435
will have an integer portion equal to 1, the `zExp' input should be 1 less
436
than the desired result exponent whenever `zSig' is a complete, normalized
437
significand.
438
-------------------------------------------------------------------------------
439
*/
440
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
441
{
442
443
return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
444
( ( (bits64) zExp )<<52 ) + zSig );
445
446
}
447
448
/*
449
-------------------------------------------------------------------------------
450
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451
and significand `zSig', and returns the proper double-precision floating-
452
point value corresponding to the abstract input. Ordinarily, the abstract
453
value is simply rounded and packed into the double-precision format, with
454
the inexact exception raised if the abstract input cannot be represented
455
exactly. However, if the abstract value is too large, the overflow and
456
inexact exceptions are raised and an infinity or maximal finite value is
457
returned. If the abstract value is too small, the input value is rounded to
458
a subnormal number, and the underflow and inexact exceptions are raised if
459
the abstract input cannot be represented exactly as a subnormal double-
460
precision floating-point number.
461
The input significand `zSig' has its binary point between bits 62
462
and 61, which is 10 bits to the left of the usual location. This shifted
463
significand must be normalized or smaller. If `zSig' is not normalized,
464
`zExp' must be 0; in that case, the result returned is a subnormal number,
465
and it must not require rounding. In the usual case that `zSig' is
466
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467
The handling of underflow and overflow follows the IEC/IEEE Standard for
468
Binary Floating-Point Arithmetic.
469
-------------------------------------------------------------------------------
470
*/
471
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
472
{
473
int8 roundingMode;
474
flag roundNearestEven;
475
int16 roundIncrement, roundBits;
476
flag isTiny;
477
478
roundingMode = float_rounding_mode;
479
roundNearestEven = ( roundingMode == float_round_nearest_even );
480
roundIncrement = 0x200;
481
if ( ! roundNearestEven ) {
482
if ( roundingMode == float_round_to_zero ) {
483
roundIncrement = 0;
484
}
485
else {
486
roundIncrement = 0x3FF;
487
if ( zSign ) {
488
if ( roundingMode == float_round_up ) roundIncrement = 0;
489
}
490
else {
491
if ( roundingMode == float_round_down ) roundIncrement = 0;
492
}
493
}
494
}
495
roundBits = zSig & 0x3FF;
496
if ( 0x7FD <= (bits16) zExp ) {
497
if ( ( 0x7FD < zExp )
498
|| ( ( zExp == 0x7FD )
499
&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
500
) {
501
float_raise( float_flag_overflow | float_flag_inexact );
502
return FLOAT64_MANGLE(
503
FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
504
( roundIncrement == 0 ));
505
}
506
if ( zExp < 0 ) {
507
isTiny =
508
( float_detect_tininess == float_tininess_before_rounding )
509
|| ( zExp < -1 )
510
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
511
shift64RightJamming( zSig, - zExp, &zSig );
512
zExp = 0;
513
roundBits = zSig & 0x3FF;
514
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
515
}
516
}
517
if ( roundBits ) float_exception_flags |= float_flag_inexact;
518
zSig = ( zSig + roundIncrement )>>10;
519
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
520
if ( zSig == 0 ) zExp = 0;
521
return packFloat64( zSign, zExp, zSig );
522
523
}
524
525
/*
526
-------------------------------------------------------------------------------
527
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528
and significand `zSig', and returns the proper double-precision floating-
529
point value corresponding to the abstract input. This routine is just like
530
`roundAndPackFloat64' except that `zSig' does not have to be normalized.
531
Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532
floating-point exponent.
533
-------------------------------------------------------------------------------
534
*/
535
static float64
536
normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
537
{
538
int8 shiftCount;
539
540
shiftCount = countLeadingZeros64( zSig ) - 1;
541
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
542
543
}
544
545
#ifdef FLOATX80
546
547
/*
548
-------------------------------------------------------------------------------
549
Returns the fraction bits of the extended double-precision floating-point
550
value `a'.
551
-------------------------------------------------------------------------------
552
*/
553
INLINE bits64 extractFloatx80Frac( floatx80 a )
554
{
555
556
return a.low;
557
558
}
559
560
/*
561
-------------------------------------------------------------------------------
562
Returns the exponent bits of the extended double-precision floating-point
563
value `a'.
564
-------------------------------------------------------------------------------
565
*/
566
INLINE int32 extractFloatx80Exp( floatx80 a )
567
{
568
569
return a.high & 0x7FFF;
570
571
}
572
573
/*
574
-------------------------------------------------------------------------------
575
Returns the sign bit of the extended double-precision floating-point value
576
`a'.
577
-------------------------------------------------------------------------------
578
*/
579
INLINE flag extractFloatx80Sign( floatx80 a )
580
{
581
582
return a.high>>15;
583
584
}
585
586
/*
587
-------------------------------------------------------------------------------
588
Normalizes the subnormal extended double-precision floating-point value
589
represented by the denormalized significand `aSig'. The normalized exponent
590
and significand are stored at the locations pointed to by `zExpPtr' and
591
`zSigPtr', respectively.
592
-------------------------------------------------------------------------------
593
*/
594
static void
595
normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
596
{
597
int8 shiftCount;
598
599
shiftCount = countLeadingZeros64( aSig );
600
*zSigPtr = aSig<<shiftCount;
601
*zExpPtr = 1 - shiftCount;
602
603
}
604
605
/*
606
-------------------------------------------------------------------------------
607
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608
extended double-precision floating-point value, returning the result.
609
-------------------------------------------------------------------------------
610
*/
611
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
612
{
613
floatx80 z;
614
615
z.low = zSig;
616
z.high = ( ( (bits16) zSign )<<15 ) + zExp;
617
return z;
618
619
}
620
621
/*
622
-------------------------------------------------------------------------------
623
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624
and extended significand formed by the concatenation of `zSig0' and `zSig1',
625
and returns the proper extended double-precision floating-point value
626
corresponding to the abstract input. Ordinarily, the abstract value is
627
rounded and packed into the extended double-precision format, with the
628
inexact exception raised if the abstract input cannot be represented
629
exactly. However, if the abstract value is too large, the overflow and
630
inexact exceptions are raised and an infinity or maximal finite value is
631
returned. If the abstract value is too small, the input value is rounded to
632
a subnormal number, and the underflow and inexact exceptions are raised if
633
the abstract input cannot be represented exactly as a subnormal extended
634
double-precision floating-point number.
635
If `roundingPrecision' is 32 or 64, the result is rounded to the same
636
number of bits as single or double precision, respectively. Otherwise, the
637
result is rounded to the full precision of the extended double-precision
638
format.
639
The input significand must be normalized or smaller. If the input
640
significand is not normalized, `zExp' must be 0; in that case, the result
641
returned is a subnormal number, and it must not require rounding. The
642
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643
Floating-Point Arithmetic.
644
-------------------------------------------------------------------------------
645
*/
646
static floatx80
647
roundAndPackFloatx80(
648
int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
649
)
650
{
651
int8 roundingMode;
652
flag roundNearestEven, increment, isTiny;
653
int64 roundIncrement, roundMask, roundBits;
654
655
roundingMode = float_rounding_mode;
656
roundNearestEven = ( roundingMode == float_round_nearest_even );
657
if ( roundingPrecision == 80 ) goto precision80;
658
if ( roundingPrecision == 64 ) {
659
roundIncrement = LIT64( 0x0000000000000400 );
660
roundMask = LIT64( 0x00000000000007FF );
661
}
662
else if ( roundingPrecision == 32 ) {
663
roundIncrement = LIT64( 0x0000008000000000 );
664
roundMask = LIT64( 0x000000FFFFFFFFFF );
665
}
666
else {
667
goto precision80;
668
}
669
zSig0 |= ( zSig1 != 0 );
670
if ( ! roundNearestEven ) {
671
if ( roundingMode == float_round_to_zero ) {
672
roundIncrement = 0;
673
}
674
else {
675
roundIncrement = roundMask;
676
if ( zSign ) {
677
if ( roundingMode == float_round_up ) roundIncrement = 0;
678
}
679
else {
680
if ( roundingMode == float_round_down ) roundIncrement = 0;
681
}
682
}
683
}
684
roundBits = zSig0 & roundMask;
685
if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686
if ( ( 0x7FFE < zExp )
687
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
688
) {
689
goto overflow;
690
}
691
if ( zExp <= 0 ) {
692
isTiny =
693
( float_detect_tininess == float_tininess_before_rounding )
694
|| ( zExp < 0 )
695
|| ( zSig0 <= zSig0 + roundIncrement );
696
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
697
zExp = 0;
698
roundBits = zSig0 & roundMask;
699
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
700
if ( roundBits ) float_exception_flags |= float_flag_inexact;
701
zSig0 += roundIncrement;
702
if ( (sbits64) zSig0 < 0 ) zExp = 1;
703
roundIncrement = roundMask + 1;
704
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
705
roundMask |= roundIncrement;
706
}
707
zSig0 &= ~ roundMask;
708
return packFloatx80( zSign, zExp, zSig0 );
709
}
710
}
711
if ( roundBits ) float_exception_flags |= float_flag_inexact;
712
zSig0 += roundIncrement;
713
if ( zSig0 < roundIncrement ) {
714
++zExp;
715
zSig0 = LIT64( 0x8000000000000000 );
716
}
717
roundIncrement = roundMask + 1;
718
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719
roundMask |= roundIncrement;
720
}
721
zSig0 &= ~ roundMask;
722
if ( zSig0 == 0 ) zExp = 0;
723
return packFloatx80( zSign, zExp, zSig0 );
724
precision80:
725
increment = ( (sbits64) zSig1 < 0 );
726
if ( ! roundNearestEven ) {
727
if ( roundingMode == float_round_to_zero ) {
728
increment = 0;
729
}
730
else {
731
if ( zSign ) {
732
increment = ( roundingMode == float_round_down ) && zSig1;
733
}
734
else {
735
increment = ( roundingMode == float_round_up ) && zSig1;
736
}
737
}
738
}
739
if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
740
if ( ( 0x7FFE < zExp )
741
|| ( ( zExp == 0x7FFE )
742
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
743
&& increment
744
)
745
) {
746
roundMask = 0;
747
overflow:
748
float_raise( float_flag_overflow | float_flag_inexact );
749
if ( ( roundingMode == float_round_to_zero )
750
|| ( zSign && ( roundingMode == float_round_up ) )
751
|| ( ! zSign && ( roundingMode == float_round_down ) )
752
) {
753
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
754
}
755
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
756
}
757
if ( zExp <= 0 ) {
758
isTiny =
759
( float_detect_tininess == float_tininess_before_rounding )
760
|| ( zExp < 0 )
761
|| ! increment
762
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
763
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
764
zExp = 0;
765
if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
766
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
767
if ( roundNearestEven ) {
768
increment = ( (sbits64) zSig1 < 0 );
769
}
770
else {
771
if ( zSign ) {
772
increment = ( roundingMode == float_round_down ) && zSig1;
773
}
774
else {
775
increment = ( roundingMode == float_round_up ) && zSig1;
776
}
777
}
778
if ( increment ) {
779
++zSig0;
780
zSig0 &=
781
~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
782
if ( (sbits64) zSig0 < 0 ) zExp = 1;
783
}
784
return packFloatx80( zSign, zExp, zSig0 );
785
}
786
}
787
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
788
if ( increment ) {
789
++zSig0;
790
if ( zSig0 == 0 ) {
791
++zExp;
792
zSig0 = LIT64( 0x8000000000000000 );
793
}
794
else {
795
zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796
}
797
}
798
else {
799
if ( zSig0 == 0 ) zExp = 0;
800
}
801
return packFloatx80( zSign, zExp, zSig0 );
802
803
}
804
805
/*
806
-------------------------------------------------------------------------------
807
Takes an abstract floating-point value having sign `zSign', exponent
808
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809
and returns the proper extended double-precision floating-point value
810
corresponding to the abstract input. This routine is just like
811
`roundAndPackFloatx80' except that the input significand does not have to be
812
normalized.
813
-------------------------------------------------------------------------------
814
*/
815
static floatx80
816
normalizeRoundAndPackFloatx80(
817
int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818
)
819
{
820
int8 shiftCount;
821
822
if ( zSig0 == 0 ) {
823
zSig0 = zSig1;
824
zSig1 = 0;
825
zExp -= 64;
826
}
827
shiftCount = countLeadingZeros64( zSig0 );
828
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829
zExp -= shiftCount;
830
return
831
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832
833
}
834
835
#endif
836
837
#ifdef FLOAT128
838
839
/*
840
-------------------------------------------------------------------------------
841
Returns the least-significant 64 fraction bits of the quadruple-precision
842
floating-point value `a'.
843
-------------------------------------------------------------------------------
844
*/
845
INLINE bits64 extractFloat128Frac1( float128 a )
846
{
847
848
return a.low;
849
850
}
851
852
/*
853
-------------------------------------------------------------------------------
854
Returns the most-significant 48 fraction bits of the quadruple-precision
855
floating-point value `a'.
856
-------------------------------------------------------------------------------
857
*/
858
INLINE bits64 extractFloat128Frac0( float128 a )
859
{
860
861
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
862
863
}
864
865
/*
866
-------------------------------------------------------------------------------
867
Returns the exponent bits of the quadruple-precision floating-point value
868
`a'.
869
-------------------------------------------------------------------------------
870
*/
871
INLINE int32 extractFloat128Exp( float128 a )
872
{
873
874
return ( a.high>>48 ) & 0x7FFF;
875
876
}
877
878
/*
879
-------------------------------------------------------------------------------
880
Returns the sign bit of the quadruple-precision floating-point value `a'.
881
-------------------------------------------------------------------------------
882
*/
883
INLINE flag extractFloat128Sign( float128 a )
884
{
885
886
return a.high>>63;
887
888
}
889
890
/*
891
-------------------------------------------------------------------------------
892
Normalizes the subnormal quadruple-precision floating-point value
893
represented by the denormalized significand formed by the concatenation of
894
`aSig0' and `aSig1'. The normalized exponent is stored at the location
895
pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896
significand are stored at the location pointed to by `zSig0Ptr', and the
897
least significant 64 bits of the normalized significand are stored at the
898
location pointed to by `zSig1Ptr'.
899
-------------------------------------------------------------------------------
900
*/
901
static void
902
normalizeFloat128Subnormal(
903
bits64 aSig0,
904
bits64 aSig1,
905
int32 *zExpPtr,
906
bits64 *zSig0Ptr,
907
bits64 *zSig1Ptr
908
)
909
{
910
int8 shiftCount;
911
912
if ( aSig0 == 0 ) {
913
shiftCount = countLeadingZeros64( aSig1 ) - 15;
914
if ( shiftCount < 0 ) {
915
*zSig0Ptr = aSig1>>( - shiftCount );
916
*zSig1Ptr = aSig1<<( shiftCount & 63 );
917
}
918
else {
919
*zSig0Ptr = aSig1<<shiftCount;
920
*zSig1Ptr = 0;
921
}
922
*zExpPtr = - shiftCount - 63;
923
}
924
else {
925
shiftCount = countLeadingZeros64( aSig0 ) - 15;
926
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927
*zExpPtr = 1 - shiftCount;
928
}
929
930
}
931
932
/*
933
-------------------------------------------------------------------------------
934
Packs the sign `zSign', the exponent `zExp', and the significand formed
935
by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936
floating-point value, returning the result. After being shifted into the
937
proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938
added together to form the most significant 32 bits of the result. This
939
means that any integer portion of `zSig0' will be added into the exponent.
940
Since a properly normalized significand will have an integer portion equal
941
to 1, the `zExp' input should be 1 less than the desired result exponent
942
whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943
significand.
944
-------------------------------------------------------------------------------
945
*/
946
INLINE float128
947
packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948
{
949
float128 z;
950
951
z.low = zSig1;
952
z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
953
return z;
954
955
}
956
957
/*
958
-------------------------------------------------------------------------------
959
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960
and extended significand formed by the concatenation of `zSig0', `zSig1',
961
and `zSig2', and returns the proper quadruple-precision floating-point value
962
corresponding to the abstract input. Ordinarily, the abstract value is
963
simply rounded and packed into the quadruple-precision format, with the
964
inexact exception raised if the abstract input cannot be represented
965
exactly. However, if the abstract value is too large, the overflow and
966
inexact exceptions are raised and an infinity or maximal finite value is
967
returned. If the abstract value is too small, the input value is rounded to
968
a subnormal number, and the underflow and inexact exceptions are raised if
969
the abstract input cannot be represented exactly as a subnormal quadruple-
970
precision floating-point number.
971
The input significand must be normalized or smaller. If the input
972
significand is not normalized, `zExp' must be 0; in that case, the result
973
returned is a subnormal number, and it must not require rounding. In the
974
usual case that the input significand is normalized, `zExp' must be 1 less
975
than the ``true'' floating-point exponent. The handling of underflow and
976
overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977
-------------------------------------------------------------------------------
978
*/
979
static float128
980
roundAndPackFloat128(
981
flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
982
{
983
int8 roundingMode;
984
flag roundNearestEven, increment, isTiny;
985
986
roundingMode = float_rounding_mode;
987
roundNearestEven = ( roundingMode == float_round_nearest_even );
988
increment = ( (sbits64) zSig2 < 0 );
989
if ( ! roundNearestEven ) {
990
if ( roundingMode == float_round_to_zero ) {
991
increment = 0;
992
}
993
else {
994
if ( zSign ) {
995
increment = ( roundingMode == float_round_down ) && zSig2;
996
}
997
else {
998
increment = ( roundingMode == float_round_up ) && zSig2;
999
}
1000
}
1001
}
1002
if ( 0x7FFD <= (bits32) zExp ) {
1003
if ( ( 0x7FFD < zExp )
1004
|| ( ( zExp == 0x7FFD )
1005
&& eq128(
1006
LIT64( 0x0001FFFFFFFFFFFF ),
1007
LIT64( 0xFFFFFFFFFFFFFFFF ),
1008
zSig0,
1009
zSig1
1010
)
1011
&& increment
1012
)
1013
) {
1014
float_raise( float_flag_overflow | float_flag_inexact );
1015
if ( ( roundingMode == float_round_to_zero )
1016
|| ( zSign && ( roundingMode == float_round_up ) )
1017
|| ( ! zSign && ( roundingMode == float_round_down ) )
1018
) {
1019
return
1020
packFloat128(
1021
zSign,
1022
0x7FFE,
1023
LIT64( 0x0000FFFFFFFFFFFF ),
1024
LIT64( 0xFFFFFFFFFFFFFFFF )
1025
);
1026
}
1027
return packFloat128( zSign, 0x7FFF, 0, 0 );
1028
}
1029
if ( zExp < 0 ) {
1030
isTiny =
1031
( float_detect_tininess == float_tininess_before_rounding )
1032
|| ( zExp < -1 )
1033
|| ! increment
1034
|| lt128(
1035
zSig0,
1036
zSig1,
1037
LIT64( 0x0001FFFFFFFFFFFF ),
1038
LIT64( 0xFFFFFFFFFFFFFFFF )
1039
);
1040
shift128ExtraRightJamming(
1041
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042
zExp = 0;
1043
if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044
if ( roundNearestEven ) {
1045
increment = ( (sbits64) zSig2 < 0 );
1046
}
1047
else {
1048
if ( zSign ) {
1049
increment = ( roundingMode == float_round_down ) && zSig2;
1050
}
1051
else {
1052
increment = ( roundingMode == float_round_up ) && zSig2;
1053
}
1054
}
1055
}
1056
}
1057
if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058
if ( increment ) {
1059
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061
}
1062
else {
1063
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064
}
1065
return packFloat128( zSign, zExp, zSig0, zSig1 );
1066
1067
}
1068
1069
/*
1070
-------------------------------------------------------------------------------
1071
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072
and significand formed by the concatenation of `zSig0' and `zSig1', and
1073
returns the proper quadruple-precision floating-point value corresponding
1074
to the abstract input. This routine is just like `roundAndPackFloat128'
1075
except that the input significand has fewer bits and does not have to be
1076
normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1077
point exponent.
1078
-------------------------------------------------------------------------------
1079
*/
1080
static float128
1081
normalizeRoundAndPackFloat128(
1082
flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1083
{
1084
int8 shiftCount;
1085
bits64 zSig2;
1086
1087
if ( zSig0 == 0 ) {
1088
zSig0 = zSig1;
1089
zSig1 = 0;
1090
zExp -= 64;
1091
}
1092
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093
if ( 0 <= shiftCount ) {
1094
zSig2 = 0;
1095
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1096
}
1097
else {
1098
shift128ExtraRightJamming(
1099
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1100
}
1101
zExp -= shiftCount;
1102
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1103
1104
}
1105
1106
#endif
1107
1108
/*
1109
-------------------------------------------------------------------------------
1110
Returns the result of converting the 32-bit two's complement integer `a'
1111
to the single-precision floating-point format. The conversion is performed
1112
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113
-------------------------------------------------------------------------------
1114
*/
1115
float32 int32_to_float32( int32 a )
1116
{
1117
flag zSign;
1118
1119
if ( a == 0 ) return 0;
1120
if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121
zSign = ( a < 0 );
1122
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1123
1124
}
1125
1126
#ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
1127
float32 uint32_to_float32( uint32 a )
1128
{
1129
if ( a == 0 ) return 0;
1130
if ( a & (bits32) 0x80000000 )
1131
return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1132
return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1133
}
1134
#endif
1135
1136
1137
/*
1138
-------------------------------------------------------------------------------
1139
Returns the result of converting the 32-bit two's complement integer `a'
1140
to the double-precision floating-point format. The conversion is performed
1141
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142
-------------------------------------------------------------------------------
1143
*/
1144
float64 int32_to_float64( int32 a )
1145
{
1146
flag zSign;
1147
uint32 absA;
1148
int8 shiftCount;
1149
bits64 zSig;
1150
1151
if ( a == 0 ) return 0;
1152
zSign = ( a < 0 );
1153
absA = zSign ? - a : a;
1154
shiftCount = countLeadingZeros32( absA ) + 21;
1155
zSig = absA;
1156
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157
1158
}
1159
1160
#ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
1161
float64 uint32_to_float64( uint32 a )
1162
{
1163
int8 shiftCount;
1164
bits64 zSig = a;
1165
1166
if ( a == 0 ) return 0;
1167
shiftCount = countLeadingZeros32( a ) + 21;
1168
return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169
1170
}
1171
#endif
1172
1173
#ifdef FLOATX80
1174
1175
/*
1176
-------------------------------------------------------------------------------
1177
Returns the result of converting the 32-bit two's complement integer `a'
1178
to the extended double-precision floating-point format. The conversion
1179
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1180
Arithmetic.
1181
-------------------------------------------------------------------------------
1182
*/
1183
floatx80 int32_to_floatx80( int32 a )
1184
{
1185
flag zSign;
1186
uint32 absA;
1187
int8 shiftCount;
1188
bits64 zSig;
1189
1190
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1191
zSign = ( a < 0 );
1192
absA = zSign ? - a : a;
1193
shiftCount = countLeadingZeros32( absA ) + 32;
1194
zSig = absA;
1195
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1196
1197
}
1198
1199
floatx80 uint32_to_floatx80( uint32 a )
1200
{
1201
int8 shiftCount;
1202
bits64 zSig = a;
1203
1204
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1205
shiftCount = countLeadingZeros32( a ) + 32;
1206
return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1207
1208
}
1209
1210
#endif
1211
1212
#ifdef FLOAT128
1213
1214
/*
1215
-------------------------------------------------------------------------------
1216
Returns the result of converting the 32-bit two's complement integer `a' to
1217
the quadruple-precision floating-point format. The conversion is performed
1218
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219
-------------------------------------------------------------------------------
1220
*/
1221
float128 int32_to_float128( int32 a )
1222
{
1223
flag zSign;
1224
uint32 absA;
1225
int8 shiftCount;
1226
bits64 zSig0;
1227
1228
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229
zSign = ( a < 0 );
1230
absA = zSign ? - a : a;
1231
shiftCount = countLeadingZeros32( absA ) + 17;
1232
zSig0 = absA;
1233
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234
1235
}
1236
1237
float128 uint32_to_float128( uint32 a )
1238
{
1239
int8 shiftCount;
1240
bits64 zSig0 = a;
1241
1242
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1243
shiftCount = countLeadingZeros32( a ) + 17;
1244
return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1245
1246
}
1247
1248
#endif
1249
1250
#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1251
/*
1252
-------------------------------------------------------------------------------
1253
Returns the result of converting the 64-bit two's complement integer `a'
1254
to the single-precision floating-point format. The conversion is performed
1255
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256
-------------------------------------------------------------------------------
1257
*/
1258
float32 int64_to_float32( int64 a )
1259
{
1260
flag zSign;
1261
uint64 absA;
1262
int8 shiftCount;
1263
1264
if ( a == 0 ) return 0;
1265
zSign = ( a < 0 );
1266
absA = zSign ? - a : a;
1267
shiftCount = countLeadingZeros64( absA ) - 40;
1268
if ( 0 <= shiftCount ) {
1269
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270
}
1271
else {
1272
shiftCount += 7;
1273
if ( shiftCount < 0 ) {
1274
shift64RightJamming( absA, - shiftCount, &absA );
1275
}
1276
else {
1277
absA <<= shiftCount;
1278
}
1279
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1280
}
1281
1282
}
1283
1284
/*
1285
-------------------------------------------------------------------------------
1286
Returns the result of converting the 64-bit two's complement integer `a'
1287
to the double-precision floating-point format. The conversion is performed
1288
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289
-------------------------------------------------------------------------------
1290
*/
1291
float64 int64_to_float64( int64 a )
1292
{
1293
flag zSign;
1294
1295
if ( a == 0 ) return 0;
1296
if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1297
return packFloat64( 1, 0x43E, 0 );
1298
}
1299
zSign = ( a < 0 );
1300
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1301
1302
}
1303
1304
#ifdef FLOATX80
1305
1306
/*
1307
-------------------------------------------------------------------------------
1308
Returns the result of converting the 64-bit two's complement integer `a'
1309
to the extended double-precision floating-point format. The conversion
1310
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1311
Arithmetic.
1312
-------------------------------------------------------------------------------
1313
*/
1314
floatx80 int64_to_floatx80( int64 a )
1315
{
1316
flag zSign;
1317
uint64 absA;
1318
int8 shiftCount;
1319
1320
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1321
zSign = ( a < 0 );
1322
absA = zSign ? - a : a;
1323
shiftCount = countLeadingZeros64( absA );
1324
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1325
1326
}
1327
1328
#endif
1329
1330
#endif /* !SOFTFLOAT_FOR_GCC */
1331
1332
#ifdef FLOAT128
1333
1334
/*
1335
-------------------------------------------------------------------------------
1336
Returns the result of converting the 64-bit two's complement integer `a' to
1337
the quadruple-precision floating-point format. The conversion is performed
1338
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339
-------------------------------------------------------------------------------
1340
*/
1341
float128 int64_to_float128( int64 a )
1342
{
1343
flag zSign;
1344
uint64 absA;
1345
int8 shiftCount;
1346
int32 zExp;
1347
bits64 zSig0, zSig1;
1348
1349
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1350
zSign = ( a < 0 );
1351
absA = zSign ? - a : a;
1352
shiftCount = countLeadingZeros64( absA ) + 49;
1353
zExp = 0x406E - shiftCount;
1354
if ( 64 <= shiftCount ) {
1355
zSig1 = 0;
1356
zSig0 = absA;
1357
shiftCount -= 64;
1358
}
1359
else {
1360
zSig1 = absA;
1361
zSig0 = 0;
1362
}
1363
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1364
return packFloat128( zSign, zExp, zSig0, zSig1 );
1365
1366
}
1367
1368
#endif
1369
1370
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1371
/*
1372
-------------------------------------------------------------------------------
1373
Returns the result of converting the single-precision floating-point value
1374
`a' to the 32-bit two's complement integer format. The conversion is
1375
performed according to the IEC/IEEE Standard for Binary Floating-Point
1376
Arithmetic---which means in particular that the conversion is rounded
1377
according to the current rounding mode. If `a' is a NaN, the largest
1378
positive integer is returned. Otherwise, if the conversion overflows, the
1379
largest integer with the same sign as `a' is returned.
1380
-------------------------------------------------------------------------------
1381
*/
1382
int32 float32_to_int32( float32 a )
1383
{
1384
flag aSign;
1385
int16 aExp, shiftCount;
1386
bits32 aSig;
1387
bits64 aSig64;
1388
1389
aSig = extractFloat32Frac( a );
1390
aExp = extractFloat32Exp( a );
1391
aSign = extractFloat32Sign( a );
1392
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1393
if ( aExp ) aSig |= 0x00800000;
1394
shiftCount = 0xAF - aExp;
1395
aSig64 = aSig;
1396
aSig64 <<= 32;
1397
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1398
return roundAndPackInt32( aSign, aSig64 );
1399
1400
}
1401
#endif /* !SOFTFLOAT_FOR_GCC */
1402
1403
/*
1404
-------------------------------------------------------------------------------
1405
Returns the result of converting the single-precision floating-point value
1406
`a' to the 32-bit two's complement integer format. The conversion is
1407
performed according to the IEC/IEEE Standard for Binary Floating-Point
1408
Arithmetic, except that the conversion is always rounded toward zero.
1409
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1410
the conversion overflows, the largest integer with the same sign as `a' is
1411
returned.
1412
-------------------------------------------------------------------------------
1413
*/
1414
int32 float32_to_int32_round_to_zero( float32 a )
1415
{
1416
flag aSign;
1417
int16 aExp, shiftCount;
1418
bits32 aSig;
1419
int32 z;
1420
1421
aSig = extractFloat32Frac( a );
1422
aExp = extractFloat32Exp( a );
1423
aSign = extractFloat32Sign( a );
1424
shiftCount = aExp - 0x9E;
1425
if ( 0 <= shiftCount ) {
1426
if ( a != 0xCF000000 ) {
1427
float_raise( float_flag_invalid );
1428
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1429
}
1430
return (sbits32) 0x80000000;
1431
}
1432
else if ( aExp <= 0x7E ) {
1433
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1434
return 0;
1435
}
1436
aSig = ( aSig | 0x00800000 )<<8;
1437
z = aSig>>( - shiftCount );
1438
if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1439
float_exception_flags |= float_flag_inexact;
1440
}
1441
if ( aSign ) z = - z;
1442
return z;
1443
1444
}
1445
1446
#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1447
/*
1448
-------------------------------------------------------------------------------
1449
Returns the result of converting the single-precision floating-point value
1450
`a' to the 64-bit two's complement integer format. The conversion is
1451
performed according to the IEC/IEEE Standard for Binary Floating-Point
1452
Arithmetic---which means in particular that the conversion is rounded
1453
according to the current rounding mode. If `a' is a NaN, the largest
1454
positive integer is returned. Otherwise, if the conversion overflows, the
1455
largest integer with the same sign as `a' is returned.
1456
-------------------------------------------------------------------------------
1457
*/
1458
int64 float32_to_int64( float32 a )
1459
{
1460
flag aSign;
1461
int16 aExp, shiftCount;
1462
bits32 aSig;
1463
bits64 aSig64, aSigExtra;
1464
1465
aSig = extractFloat32Frac( a );
1466
aExp = extractFloat32Exp( a );
1467
aSign = extractFloat32Sign( a );
1468
shiftCount = 0xBE - aExp;
1469
if ( shiftCount < 0 ) {
1470
float_raise( float_flag_invalid );
1471
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1472
return LIT64( 0x7FFFFFFFFFFFFFFF );
1473
}
1474
return (sbits64) LIT64( 0x8000000000000000 );
1475
}
1476
if ( aExp ) aSig |= 0x00800000;
1477
aSig64 = aSig;
1478
aSig64 <<= 40;
1479
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1480
return roundAndPackInt64( aSign, aSig64, aSigExtra );
1481
1482
}
1483
1484
/*
1485
-------------------------------------------------------------------------------
1486
Returns the result of converting the single-precision floating-point value
1487
`a' to the 64-bit two's complement integer format. The conversion is
1488
performed according to the IEC/IEEE Standard for Binary Floating-Point
1489
Arithmetic, except that the conversion is always rounded toward zero. If
1490
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
1491
conversion overflows, the largest integer with the same sign as `a' is
1492
returned.
1493
-------------------------------------------------------------------------------
1494
*/
1495
int64 float32_to_int64_round_to_zero( float32 a )
1496
{
1497
flag aSign;
1498
int16 aExp, shiftCount;
1499
bits32 aSig;
1500
bits64 aSig64;
1501
int64 z;
1502
1503
aSig = extractFloat32Frac( a );
1504
aExp = extractFloat32Exp( a );
1505
aSign = extractFloat32Sign( a );
1506
shiftCount = aExp - 0xBE;
1507
if ( 0 <= shiftCount ) {
1508
if ( a != 0xDF000000 ) {
1509
float_raise( float_flag_invalid );
1510
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1511
return LIT64( 0x7FFFFFFFFFFFFFFF );
1512
}
1513
}
1514
return (sbits64) LIT64( 0x8000000000000000 );
1515
}
1516
else if ( aExp <= 0x7E ) {
1517
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1518
return 0;
1519
}
1520
aSig64 = aSig | 0x00800000;
1521
aSig64 <<= 40;
1522
z = aSig64>>( - shiftCount );
1523
if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1524
float_exception_flags |= float_flag_inexact;
1525
}
1526
if ( aSign ) z = - z;
1527
return z;
1528
1529
}
1530
#endif /* !SOFTFLOAT_FOR_GCC */
1531
1532
/*
1533
-------------------------------------------------------------------------------
1534
Returns the result of converting the single-precision floating-point value
1535
`a' to the double-precision floating-point format. The conversion is
1536
performed according to the IEC/IEEE Standard for Binary Floating-Point
1537
Arithmetic.
1538
-------------------------------------------------------------------------------
1539
*/
1540
float64 float32_to_float64( float32 a )
1541
{
1542
flag aSign;
1543
int16 aExp;
1544
bits32 aSig;
1545
1546
aSig = extractFloat32Frac( a );
1547
aExp = extractFloat32Exp( a );
1548
aSign = extractFloat32Sign( a );
1549
if ( aExp == 0xFF ) {
1550
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1551
return packFloat64( aSign, 0x7FF, 0 );
1552
}
1553
if ( aExp == 0 ) {
1554
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1555
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1556
--aExp;
1557
}
1558
return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1559
1560
}
1561
1562
#ifdef FLOATX80
1563
1564
/*
1565
-------------------------------------------------------------------------------
1566
Returns the result of converting the single-precision floating-point value
1567
`a' to the extended double-precision floating-point format. The conversion
1568
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1569
Arithmetic.
1570
-------------------------------------------------------------------------------
1571
*/
1572
floatx80 float32_to_floatx80( float32 a )
1573
{
1574
flag aSign;
1575
int16 aExp;
1576
bits32 aSig;
1577
1578
aSig = extractFloat32Frac( a );
1579
aExp = extractFloat32Exp( a );
1580
aSign = extractFloat32Sign( a );
1581
if ( aExp == 0xFF ) {
1582
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1583
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1584
}
1585
if ( aExp == 0 ) {
1586
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1587
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588
}
1589
aSig |= 0x00800000;
1590
return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1591
1592
}
1593
1594
#endif
1595
1596
#ifdef FLOAT128
1597
1598
/*
1599
-------------------------------------------------------------------------------
1600
Returns the result of converting the single-precision floating-point value
1601
`a' to the double-precision floating-point format. The conversion is
1602
performed according to the IEC/IEEE Standard for Binary Floating-Point
1603
Arithmetic.
1604
-------------------------------------------------------------------------------
1605
*/
1606
float128 float32_to_float128( float32 a )
1607
{
1608
flag aSign;
1609
int16 aExp;
1610
bits32 aSig;
1611
1612
aSig = extractFloat32Frac( a );
1613
aExp = extractFloat32Exp( a );
1614
aSign = extractFloat32Sign( a );
1615
if ( aExp == 0xFF ) {
1616
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1617
return packFloat128( aSign, 0x7FFF, 0, 0 );
1618
}
1619
if ( aExp == 0 ) {
1620
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1621
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1622
--aExp;
1623
}
1624
return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1625
1626
}
1627
1628
#endif
1629
1630
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1631
/*
1632
-------------------------------------------------------------------------------
1633
Rounds the single-precision floating-point value `a' to an integer, and
1634
returns the result as a single-precision floating-point value. The
1635
operation is performed according to the IEC/IEEE Standard for Binary
1636
Floating-Point Arithmetic.
1637
-------------------------------------------------------------------------------
1638
*/
1639
float32 float32_round_to_int( float32 a )
1640
{
1641
flag aSign;
1642
int16 aExp;
1643
bits32 lastBitMask, roundBitsMask;
1644
int8 roundingMode;
1645
float32 z;
1646
1647
aExp = extractFloat32Exp( a );
1648
if ( 0x96 <= aExp ) {
1649
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1650
return propagateFloat32NaN( a, a );
1651
}
1652
return a;
1653
}
1654
if ( aExp <= 0x7E ) {
1655
if ( (bits32) ( a<<1 ) == 0 ) return a;
1656
float_exception_flags |= float_flag_inexact;
1657
aSign = extractFloat32Sign( a );
1658
switch ( float_rounding_mode ) {
1659
case float_round_nearest_even:
1660
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1661
return packFloat32( aSign, 0x7F, 0 );
1662
}
1663
break;
1664
case float_round_to_zero:
1665
break;
1666
case float_round_down:
1667
return aSign ? 0xBF800000 : 0;
1668
case float_round_up:
1669
return aSign ? 0x80000000 : 0x3F800000;
1670
}
1671
return packFloat32( aSign, 0, 0 );
1672
}
1673
lastBitMask = 1;
1674
lastBitMask <<= 0x96 - aExp;
1675
roundBitsMask = lastBitMask - 1;
1676
z = a;
1677
roundingMode = float_rounding_mode;
1678
if ( roundingMode == float_round_nearest_even ) {
1679
z += lastBitMask>>1;
1680
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1681
}
1682
else if ( roundingMode != float_round_to_zero ) {
1683
if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1684
z += roundBitsMask;
1685
}
1686
}
1687
z &= ~ roundBitsMask;
1688
if ( z != a ) float_exception_flags |= float_flag_inexact;
1689
return z;
1690
1691
}
1692
#endif /* !SOFTFLOAT_FOR_GCC */
1693
1694
/*
1695
-------------------------------------------------------------------------------
1696
Returns the result of adding the absolute values of the single-precision
1697
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1698
before being returned. `zSign' is ignored if the result is a NaN.
1699
The addition is performed according to the IEC/IEEE Standard for Binary
1700
Floating-Point Arithmetic.
1701
-------------------------------------------------------------------------------
1702
*/
1703
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1704
{
1705
int16 aExp, bExp, zExp;
1706
bits32 aSig, bSig, zSig;
1707
int16 expDiff;
1708
1709
aSig = extractFloat32Frac( a );
1710
aExp = extractFloat32Exp( a );
1711
bSig = extractFloat32Frac( b );
1712
bExp = extractFloat32Exp( b );
1713
expDiff = aExp - bExp;
1714
aSig <<= 6;
1715
bSig <<= 6;
1716
if ( 0 < expDiff ) {
1717
if ( aExp == 0xFF ) {
1718
if ( aSig ) return propagateFloat32NaN( a, b );
1719
return a;
1720
}
1721
if ( bExp == 0 ) {
1722
--expDiff;
1723
}
1724
else {
1725
bSig |= 0x20000000;
1726
}
1727
shift32RightJamming( bSig, expDiff, &bSig );
1728
zExp = aExp;
1729
}
1730
else if ( expDiff < 0 ) {
1731
if ( bExp == 0xFF ) {
1732
if ( bSig ) return propagateFloat32NaN( a, b );
1733
return packFloat32( zSign, 0xFF, 0 );
1734
}
1735
if ( aExp == 0 ) {
1736
++expDiff;
1737
}
1738
else {
1739
aSig |= 0x20000000;
1740
}
1741
shift32RightJamming( aSig, - expDiff, &aSig );
1742
zExp = bExp;
1743
}
1744
else {
1745
if ( aExp == 0xFF ) {
1746
if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1747
return a;
1748
}
1749
if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1750
zSig = 0x40000000 + aSig + bSig;
1751
zExp = aExp;
1752
goto roundAndPack;
1753
}
1754
aSig |= 0x20000000;
1755
zSig = ( aSig + bSig )<<1;
1756
--zExp;
1757
if ( (sbits32) zSig < 0 ) {
1758
zSig = aSig + bSig;
1759
++zExp;
1760
}
1761
roundAndPack:
1762
return roundAndPackFloat32( zSign, zExp, zSig );
1763
1764
}
1765
1766
/*
1767
-------------------------------------------------------------------------------
1768
Returns the result of subtracting the absolute values of the single-
1769
precision floating-point values `a' and `b'. If `zSign' is 1, the
1770
difference is negated before being returned. `zSign' is ignored if the
1771
result is a NaN. The subtraction is performed according to the IEC/IEEE
1772
Standard for Binary Floating-Point Arithmetic.
1773
-------------------------------------------------------------------------------
1774
*/
1775
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1776
{
1777
int16 aExp, bExp, zExp;
1778
bits32 aSig, bSig, zSig;
1779
int16 expDiff;
1780
1781
aSig = extractFloat32Frac( a );
1782
aExp = extractFloat32Exp( a );
1783
bSig = extractFloat32Frac( b );
1784
bExp = extractFloat32Exp( b );
1785
expDiff = aExp - bExp;
1786
aSig <<= 7;
1787
bSig <<= 7;
1788
if ( 0 < expDiff ) goto aExpBigger;
1789
if ( expDiff < 0 ) goto bExpBigger;
1790
if ( aExp == 0xFF ) {
1791
if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1792
float_raise( float_flag_invalid );
1793
return float32_default_nan;
1794
}
1795
if ( aExp == 0 ) {
1796
aExp = 1;
1797
bExp = 1;
1798
}
1799
if ( bSig < aSig ) goto aBigger;
1800
if ( aSig < bSig ) goto bBigger;
1801
return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1802
bExpBigger:
1803
if ( bExp == 0xFF ) {
1804
if ( bSig ) return propagateFloat32NaN( a, b );
1805
return packFloat32( zSign ^ 1, 0xFF, 0 );
1806
}
1807
if ( aExp == 0 ) {
1808
++expDiff;
1809
}
1810
else {
1811
aSig |= 0x40000000;
1812
}
1813
shift32RightJamming( aSig, - expDiff, &aSig );
1814
bSig |= 0x40000000;
1815
bBigger:
1816
zSig = bSig - aSig;
1817
zExp = bExp;
1818
zSign ^= 1;
1819
goto normalizeRoundAndPack;
1820
aExpBigger:
1821
if ( aExp == 0xFF ) {
1822
if ( aSig ) return propagateFloat32NaN( a, b );
1823
return a;
1824
}
1825
if ( bExp == 0 ) {
1826
--expDiff;
1827
}
1828
else {
1829
bSig |= 0x40000000;
1830
}
1831
shift32RightJamming( bSig, expDiff, &bSig );
1832
aSig |= 0x40000000;
1833
aBigger:
1834
zSig = aSig - bSig;
1835
zExp = aExp;
1836
normalizeRoundAndPack:
1837
--zExp;
1838
return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1839
1840
}
1841
1842
/*
1843
-------------------------------------------------------------------------------
1844
Returns the result of adding the single-precision floating-point values `a'
1845
and `b'. The operation is performed according to the IEC/IEEE Standard for
1846
Binary Floating-Point Arithmetic.
1847
-------------------------------------------------------------------------------
1848
*/
1849
float32 float32_add( float32 a, float32 b )
1850
{
1851
flag aSign, bSign;
1852
1853
aSign = extractFloat32Sign( a );
1854
bSign = extractFloat32Sign( b );
1855
if ( aSign == bSign ) {
1856
return addFloat32Sigs( a, b, aSign );
1857
}
1858
else {
1859
return subFloat32Sigs( a, b, aSign );
1860
}
1861
1862
}
1863
1864
/*
1865
-------------------------------------------------------------------------------
1866
Returns the result of subtracting the single-precision floating-point values
1867
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1868
for Binary Floating-Point Arithmetic.
1869
-------------------------------------------------------------------------------
1870
*/
1871
float32 float32_sub( float32 a, float32 b )
1872
{
1873
flag aSign, bSign;
1874
1875
aSign = extractFloat32Sign( a );
1876
bSign = extractFloat32Sign( b );
1877
if ( aSign == bSign ) {
1878
return subFloat32Sigs( a, b, aSign );
1879
}
1880
else {
1881
return addFloat32Sigs( a, b, aSign );
1882
}
1883
1884
}
1885
1886
/*
1887
-------------------------------------------------------------------------------
1888
Returns the result of multiplying the single-precision floating-point values
1889
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1890
for Binary Floating-Point Arithmetic.
1891
-------------------------------------------------------------------------------
1892
*/
1893
float32 float32_mul( float32 a, float32 b )
1894
{
1895
flag aSign, bSign, zSign;
1896
int16 aExp, bExp, zExp;
1897
bits32 aSig, bSig;
1898
bits64 zSig64;
1899
bits32 zSig;
1900
1901
aSig = extractFloat32Frac( a );
1902
aExp = extractFloat32Exp( a );
1903
aSign = extractFloat32Sign( a );
1904
bSig = extractFloat32Frac( b );
1905
bExp = extractFloat32Exp( b );
1906
bSign = extractFloat32Sign( b );
1907
zSign = aSign ^ bSign;
1908
if ( aExp == 0xFF ) {
1909
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1910
return propagateFloat32NaN( a, b );
1911
}
1912
if ( ( bExp | bSig ) == 0 ) {
1913
float_raise( float_flag_invalid );
1914
return float32_default_nan;
1915
}
1916
return packFloat32( zSign, 0xFF, 0 );
1917
}
1918
if ( bExp == 0xFF ) {
1919
if ( bSig ) return propagateFloat32NaN( a, b );
1920
if ( ( aExp | aSig ) == 0 ) {
1921
float_raise( float_flag_invalid );
1922
return float32_default_nan;
1923
}
1924
return packFloat32( zSign, 0xFF, 0 );
1925
}
1926
if ( aExp == 0 ) {
1927
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1928
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1929
}
1930
if ( bExp == 0 ) {
1931
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1932
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933
}
1934
zExp = aExp + bExp - 0x7F;
1935
aSig = ( aSig | 0x00800000 )<<7;
1936
bSig = ( bSig | 0x00800000 )<<8;
1937
shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1938
zSig = zSig64;
1939
if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1940
zSig <<= 1;
1941
--zExp;
1942
}
1943
return roundAndPackFloat32( zSign, zExp, zSig );
1944
1945
}
1946
1947
/*
1948
-------------------------------------------------------------------------------
1949
Returns the result of dividing the single-precision floating-point value `a'
1950
by the corresponding value `b'. The operation is performed according to the
1951
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952
-------------------------------------------------------------------------------
1953
*/
1954
float32 float32_div( float32 a, float32 b )
1955
{
1956
flag aSign, bSign, zSign;
1957
int16 aExp, bExp, zExp;
1958
bits32 aSig, bSig, zSig;
1959
1960
aSig = extractFloat32Frac( a );
1961
aExp = extractFloat32Exp( a );
1962
aSign = extractFloat32Sign( a );
1963
bSig = extractFloat32Frac( b );
1964
bExp = extractFloat32Exp( b );
1965
bSign = extractFloat32Sign( b );
1966
zSign = aSign ^ bSign;
1967
if ( aExp == 0xFF ) {
1968
if ( aSig ) return propagateFloat32NaN( a, b );
1969
if ( bExp == 0xFF ) {
1970
if ( bSig ) return propagateFloat32NaN( a, b );
1971
float_raise( float_flag_invalid );
1972
return float32_default_nan;
1973
}
1974
return packFloat32( zSign, 0xFF, 0 );
1975
}
1976
if ( bExp == 0xFF ) {
1977
if ( bSig ) return propagateFloat32NaN( a, b );
1978
return packFloat32( zSign, 0, 0 );
1979
}
1980
if ( bExp == 0 ) {
1981
if ( bSig == 0 ) {
1982
if ( ( aExp | aSig ) == 0 ) {
1983
float_raise( float_flag_invalid );
1984
return float32_default_nan;
1985
}
1986
float_raise( float_flag_divbyzero );
1987
return packFloat32( zSign, 0xFF, 0 );
1988
}
1989
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1990
}
1991
if ( aExp == 0 ) {
1992
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1993
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1994
}
1995
zExp = aExp - bExp + 0x7D;
1996
aSig = ( aSig | 0x00800000 )<<7;
1997
bSig = ( bSig | 0x00800000 )<<8;
1998
if ( bSig <= ( aSig + aSig ) ) {
1999
aSig >>= 1;
2000
++zExp;
2001
}
2002
zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2003
if ( ( zSig & 0x3F ) == 0 ) {
2004
zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2005
}
2006
return roundAndPackFloat32( zSign, zExp, zSig );
2007
2008
}
2009
2010
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2011
/*
2012
-------------------------------------------------------------------------------
2013
Returns the remainder of the single-precision floating-point value `a'
2014
with respect to the corresponding value `b'. The operation is performed
2015
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016
-------------------------------------------------------------------------------
2017
*/
2018
float32 float32_rem( float32 a, float32 b )
2019
{
2020
flag aSign, bSign, zSign;
2021
int16 aExp, bExp, expDiff;
2022
bits32 aSig, bSig;
2023
bits32 q;
2024
bits64 aSig64, bSig64, q64;
2025
bits32 alternateASig;
2026
sbits32 sigMean;
2027
2028
aSig = extractFloat32Frac( a );
2029
aExp = extractFloat32Exp( a );
2030
aSign = extractFloat32Sign( a );
2031
bSig = extractFloat32Frac( b );
2032
bExp = extractFloat32Exp( b );
2033
bSign = extractFloat32Sign( b );
2034
if ( aExp == 0xFF ) {
2035
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2036
return propagateFloat32NaN( a, b );
2037
}
2038
float_raise( float_flag_invalid );
2039
return float32_default_nan;
2040
}
2041
if ( bExp == 0xFF ) {
2042
if ( bSig ) return propagateFloat32NaN( a, b );
2043
return a;
2044
}
2045
if ( bExp == 0 ) {
2046
if ( bSig == 0 ) {
2047
float_raise( float_flag_invalid );
2048
return float32_default_nan;
2049
}
2050
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2051
}
2052
if ( aExp == 0 ) {
2053
if ( aSig == 0 ) return a;
2054
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2055
}
2056
expDiff = aExp - bExp;
2057
aSig |= 0x00800000;
2058
bSig |= 0x00800000;
2059
if ( expDiff < 32 ) {
2060
aSig <<= 8;
2061
bSig <<= 8;
2062
if ( expDiff < 0 ) {
2063
if ( expDiff < -1 ) return a;
2064
aSig >>= 1;
2065
}
2066
q = ( bSig <= aSig );
2067
if ( q ) aSig -= bSig;
2068
if ( 0 < expDiff ) {
2069
q = ( ( (bits64) aSig )<<32 ) / bSig;
2070
q >>= 32 - expDiff;
2071
bSig >>= 2;
2072
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2073
}
2074
else {
2075
aSig >>= 2;
2076
bSig >>= 2;
2077
}
2078
}
2079
else {
2080
if ( bSig <= aSig ) aSig -= bSig;
2081
aSig64 = ( (bits64) aSig )<<40;
2082
bSig64 = ( (bits64) bSig )<<40;
2083
expDiff -= 64;
2084
while ( 0 < expDiff ) {
2085
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2086
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2087
aSig64 = - ( ( bSig * q64 )<<38 );
2088
expDiff -= 62;
2089
}
2090
expDiff += 64;
2091
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2092
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2093
q = q64>>( 64 - expDiff );
2094
bSig <<= 6;
2095
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2096
}
2097
do {
2098
alternateASig = aSig;
2099
++q;
2100
aSig -= bSig;
2101
} while ( 0 <= (sbits32) aSig );
2102
sigMean = aSig + alternateASig;
2103
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2104
aSig = alternateASig;
2105
}
2106
zSign = ( (sbits32) aSig < 0 );
2107
if ( zSign ) aSig = - aSig;
2108
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2109
2110
}
2111
#endif /* !SOFTFLOAT_FOR_GCC */
2112
2113
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2114
/*
2115
-------------------------------------------------------------------------------
2116
Returns the square root of the single-precision floating-point value `a'.
2117
The operation is performed according to the IEC/IEEE Standard for Binary
2118
Floating-Point Arithmetic.
2119
-------------------------------------------------------------------------------
2120
*/
2121
float32 float32_sqrt( float32 a )
2122
{
2123
flag aSign;
2124
int16 aExp, zExp;
2125
bits32 aSig, zSig;
2126
bits64 rem, term;
2127
2128
aSig = extractFloat32Frac( a );
2129
aExp = extractFloat32Exp( a );
2130
aSign = extractFloat32Sign( a );
2131
if ( aExp == 0xFF ) {
2132
if ( aSig ) return propagateFloat32NaN( a, 0 );
2133
if ( ! aSign ) return a;
2134
float_raise( float_flag_invalid );
2135
return float32_default_nan;
2136
}
2137
if ( aSign ) {
2138
if ( ( aExp | aSig ) == 0 ) return a;
2139
float_raise( float_flag_invalid );
2140
return float32_default_nan;
2141
}
2142
if ( aExp == 0 ) {
2143
if ( aSig == 0 ) return 0;
2144
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2145
}
2146
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2147
aSig = ( aSig | 0x00800000 )<<8;
2148
zSig = estimateSqrt32( aExp, aSig ) + 2;
2149
if ( ( zSig & 0x7F ) <= 5 ) {
2150
if ( zSig < 2 ) {
2151
zSig = 0x7FFFFFFF;
2152
goto roundAndPack;
2153
}
2154
aSig >>= aExp & 1;
2155
term = ( (bits64) zSig ) * zSig;
2156
rem = ( ( (bits64) aSig )<<32 ) - term;
2157
while ( (sbits64) rem < 0 ) {
2158
--zSig;
2159
rem += ( ( (bits64) zSig )<<1 ) | 1;
2160
}
2161
zSig |= ( rem != 0 );
2162
}
2163
shift32RightJamming( zSig, 1, &zSig );
2164
roundAndPack:
2165
return roundAndPackFloat32( 0, zExp, zSig );
2166
2167
}
2168
#endif /* !SOFTFLOAT_FOR_GCC */
2169
2170
/*
2171
-------------------------------------------------------------------------------
2172
Returns 1 if the single-precision floating-point value `a' is equal to
2173
the corresponding value `b', and 0 otherwise. The comparison is performed
2174
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175
-------------------------------------------------------------------------------
2176
*/
2177
flag float32_eq( float32 a, float32 b )
2178
{
2179
2180
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2181
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2182
) {
2183
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2184
float_raise( float_flag_invalid );
2185
}
2186
return 0;
2187
}
2188
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2189
2190
}
2191
2192
/*
2193
-------------------------------------------------------------------------------
2194
Returns 1 if the single-precision floating-point value `a' is less than
2195
or equal to the corresponding value `b', and 0 otherwise. The comparison
2196
is performed according to the IEC/IEEE Standard for Binary Floating-Point
2197
Arithmetic.
2198
-------------------------------------------------------------------------------
2199
*/
2200
flag float32_le( float32 a, float32 b )
2201
{
2202
flag aSign, bSign;
2203
2204
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2205
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2206
) {
2207
float_raise( float_flag_invalid );
2208
return 0;
2209
}
2210
aSign = extractFloat32Sign( a );
2211
bSign = extractFloat32Sign( b );
2212
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2213
return ( a == b ) || ( aSign ^ ( a < b ) );
2214
2215
}
2216
2217
/*
2218
-------------------------------------------------------------------------------
2219
Returns 1 if the single-precision floating-point value `a' is less than
2220
the corresponding value `b', and 0 otherwise. The comparison is performed
2221
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222
-------------------------------------------------------------------------------
2223
*/
2224
flag float32_lt( float32 a, float32 b )
2225
{
2226
flag aSign, bSign;
2227
2228
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2229
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2230
) {
2231
float_raise( float_flag_invalid );
2232
return 0;
2233
}
2234
aSign = extractFloat32Sign( a );
2235
bSign = extractFloat32Sign( b );
2236
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2237
return ( a != b ) && ( aSign ^ ( a < b ) );
2238
2239
}
2240
2241
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2242
/*
2243
-------------------------------------------------------------------------------
2244
Returns 1 if the single-precision floating-point value `a' is equal to
2245
the corresponding value `b', and 0 otherwise. The invalid exception is
2246
raised if either operand is a NaN. Otherwise, the comparison is performed
2247
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248
-------------------------------------------------------------------------------
2249
*/
2250
flag float32_eq_signaling( float32 a, float32 b )
2251
{
2252
2253
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2254
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2255
) {
2256
float_raise( float_flag_invalid );
2257
return 0;
2258
}
2259
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2260
2261
}
2262
2263
/*
2264
-------------------------------------------------------------------------------
2265
Returns 1 if the single-precision floating-point value `a' is less than or
2266
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2267
cause an exception. Otherwise, the comparison is performed according to the
2268
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269
-------------------------------------------------------------------------------
2270
*/
2271
flag float32_le_quiet( float32 a, float32 b )
2272
{
2273
flag aSign, bSign;
2274
2275
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2276
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2277
) {
2278
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2279
float_raise( float_flag_invalid );
2280
}
2281
return 0;
2282
}
2283
aSign = extractFloat32Sign( a );
2284
bSign = extractFloat32Sign( b );
2285
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2286
return ( a == b ) || ( aSign ^ ( a < b ) );
2287
2288
}
2289
2290
/*
2291
-------------------------------------------------------------------------------
2292
Returns 1 if the single-precision floating-point value `a' is less than
2293
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2294
exception. Otherwise, the comparison is performed according to the IEC/IEEE
2295
Standard for Binary Floating-Point Arithmetic.
2296
-------------------------------------------------------------------------------
2297
*/
2298
flag float32_lt_quiet( float32 a, float32 b )
2299
{
2300
flag aSign, bSign;
2301
2302
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2303
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2304
) {
2305
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2306
float_raise( float_flag_invalid );
2307
}
2308
return 0;
2309
}
2310
aSign = extractFloat32Sign( a );
2311
bSign = extractFloat32Sign( b );
2312
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2313
return ( a != b ) && ( aSign ^ ( a < b ) );
2314
2315
}
2316
#endif /* !SOFTFLOAT_FOR_GCC */
2317
2318
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2319
/*
2320
-------------------------------------------------------------------------------
2321
Returns the result of converting the double-precision floating-point value
2322
`a' to the 32-bit two's complement integer format. The conversion is
2323
performed according to the IEC/IEEE Standard for Binary Floating-Point
2324
Arithmetic---which means in particular that the conversion is rounded
2325
according to the current rounding mode. If `a' is a NaN, the largest
2326
positive integer is returned. Otherwise, if the conversion overflows, the
2327
largest integer with the same sign as `a' is returned.
2328
-------------------------------------------------------------------------------
2329
*/
2330
int32 float64_to_int32( float64 a )
2331
{
2332
flag aSign;
2333
int16 aExp, shiftCount;
2334
bits64 aSig;
2335
2336
aSig = extractFloat64Frac( a );
2337
aExp = extractFloat64Exp( a );
2338
aSign = extractFloat64Sign( a );
2339
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2340
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2341
shiftCount = 0x42C - aExp;
2342
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2343
return roundAndPackInt32( aSign, aSig );
2344
2345
}
2346
#endif /* !SOFTFLOAT_FOR_GCC */
2347
2348
/*
2349
-------------------------------------------------------------------------------
2350
Returns the result of converting the double-precision floating-point value
2351
`a' to the 32-bit two's complement integer format. The conversion is
2352
performed according to the IEC/IEEE Standard for Binary Floating-Point
2353
Arithmetic, except that the conversion is always rounded toward zero.
2354
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2355
the conversion overflows, the largest integer with the same sign as `a' is
2356
returned.
2357
-------------------------------------------------------------------------------
2358
*/
2359
int32 float64_to_int32_round_to_zero( float64 a )
2360
{
2361
flag aSign;
2362
int16 aExp, shiftCount;
2363
bits64 aSig, savedASig;
2364
int32 z;
2365
2366
aSig = extractFloat64Frac( a );
2367
aExp = extractFloat64Exp( a );
2368
aSign = extractFloat64Sign( a );
2369
if ( 0x41E < aExp ) {
2370
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2371
goto invalid;
2372
}
2373
else if ( aExp < 0x3FF ) {
2374
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2375
return 0;
2376
}
2377
aSig |= LIT64( 0x0010000000000000 );
2378
shiftCount = 0x433 - aExp;
2379
savedASig = aSig;
2380
aSig >>= shiftCount;
2381
z = aSig;
2382
if ( aSign ) z = - z;
2383
if ( ( z < 0 ) ^ aSign ) {
2384
invalid:
2385
float_raise( float_flag_invalid );
2386
return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2387
}
2388
if ( ( aSig<<shiftCount ) != savedASig ) {
2389
float_exception_flags |= float_flag_inexact;
2390
}
2391
return z;
2392
2393
}
2394
2395
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2396
/*
2397
-------------------------------------------------------------------------------
2398
Returns the result of converting the double-precision floating-point value
2399
`a' to the 64-bit two's complement integer format. The conversion is
2400
performed according to the IEC/IEEE Standard for Binary Floating-Point
2401
Arithmetic---which means in particular that the conversion is rounded
2402
according to the current rounding mode. If `a' is a NaN, the largest
2403
positive integer is returned. Otherwise, if the conversion overflows, the
2404
largest integer with the same sign as `a' is returned.
2405
-------------------------------------------------------------------------------
2406
*/
2407
int64 float64_to_int64( float64 a )
2408
{
2409
flag aSign;
2410
int16 aExp, shiftCount;
2411
bits64 aSig, aSigExtra;
2412
2413
aSig = extractFloat64Frac( a );
2414
aExp = extractFloat64Exp( a );
2415
aSign = extractFloat64Sign( a );
2416
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2417
shiftCount = 0x433 - aExp;
2418
if ( shiftCount <= 0 ) {
2419
if ( 0x43E < aExp ) {
2420
float_raise( float_flag_invalid );
2421
if ( ! aSign
2422
|| ( ( aExp == 0x7FF )
2423
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2424
) {
2425
return LIT64( 0x7FFFFFFFFFFFFFFF );
2426
}
2427
return (sbits64) LIT64( 0x8000000000000000 );
2428
}
2429
aSigExtra = 0;
2430
aSig <<= - shiftCount;
2431
}
2432
else {
2433
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2434
}
2435
return roundAndPackInt64( aSign, aSig, aSigExtra );
2436
2437
}
2438
2439
/*
2440
-------------------------------------------------------------------------------
2441
Returns the result of converting the double-precision floating-point value
2442
`a' to the 64-bit two's complement integer format. The conversion is
2443
performed according to the IEC/IEEE Standard for Binary Floating-Point
2444
Arithmetic, except that the conversion is always rounded toward zero.
2445
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2446
the conversion overflows, the largest integer with the same sign as `a' is
2447
returned.
2448
-------------------------------------------------------------------------------
2449
*/
2450
int64 float64_to_int64_round_to_zero( float64 a )
2451
{
2452
flag aSign;
2453
int16 aExp, shiftCount;
2454
bits64 aSig;
2455
int64 z;
2456
2457
aSig = extractFloat64Frac( a );
2458
aExp = extractFloat64Exp( a );
2459
aSign = extractFloat64Sign( a );
2460
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2461
shiftCount = aExp - 0x433;
2462
if ( 0 <= shiftCount ) {
2463
if ( 0x43E <= aExp ) {
2464
if ( a != LIT64( 0xC3E0000000000000 ) ) {
2465
float_raise( float_flag_invalid );
2466
if ( ! aSign
2467
|| ( ( aExp == 0x7FF )
2468
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2469
) {
2470
return LIT64( 0x7FFFFFFFFFFFFFFF );
2471
}
2472
}
2473
return (sbits64) LIT64( 0x8000000000000000 );
2474
}
2475
z = aSig<<shiftCount;
2476
}
2477
else {
2478
if ( aExp < 0x3FE ) {
2479
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2480
return 0;
2481
}
2482
z = aSig>>( - shiftCount );
2483
if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2484
float_exception_flags |= float_flag_inexact;
2485
}
2486
}
2487
if ( aSign ) z = - z;
2488
return z;
2489
2490
}
2491
#endif /* !SOFTFLOAT_FOR_GCC */
2492
2493
/*
2494
-------------------------------------------------------------------------------
2495
Returns the result of converting the double-precision floating-point value
2496
`a' to the single-precision floating-point format. The conversion is
2497
performed according to the IEC/IEEE Standard for Binary Floating-Point
2498
Arithmetic.
2499
-------------------------------------------------------------------------------
2500
*/
2501
float32 float64_to_float32( float64 a )
2502
{
2503
flag aSign;
2504
int16 aExp;
2505
bits64 aSig;
2506
bits32 zSig;
2507
2508
aSig = extractFloat64Frac( a );
2509
aExp = extractFloat64Exp( a );
2510
aSign = extractFloat64Sign( a );
2511
if ( aExp == 0x7FF ) {
2512
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2513
return packFloat32( aSign, 0xFF, 0 );
2514
}
2515
shift64RightJamming( aSig, 22, &aSig );
2516
zSig = aSig;
2517
if ( aExp || zSig ) {
2518
zSig |= 0x40000000;
2519
aExp -= 0x381;
2520
}
2521
return roundAndPackFloat32( aSign, aExp, zSig );
2522
2523
}
2524
2525
#ifdef FLOATX80
2526
2527
/*
2528
-------------------------------------------------------------------------------
2529
Returns the result of converting the double-precision floating-point value
2530
`a' to the extended double-precision floating-point format. The conversion
2531
is performed according to the IEC/IEEE Standard for Binary Floating-Point
2532
Arithmetic.
2533
-------------------------------------------------------------------------------
2534
*/
2535
floatx80 float64_to_floatx80( float64 a )
2536
{
2537
flag aSign;
2538
int16 aExp;
2539
bits64 aSig;
2540
2541
aSig = extractFloat64Frac( a );
2542
aExp = extractFloat64Exp( a );
2543
aSign = extractFloat64Sign( a );
2544
if ( aExp == 0x7FF ) {
2545
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2546
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2547
}
2548
if ( aExp == 0 ) {
2549
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2550
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551
}
2552
return
2553
packFloatx80(
2554
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2555
2556
}
2557
2558
#endif
2559
2560
#ifdef FLOAT128
2561
2562
/*
2563
-------------------------------------------------------------------------------
2564
Returns the result of converting the double-precision floating-point value
2565
`a' to the quadruple-precision floating-point format. The conversion is
2566
performed according to the IEC/IEEE Standard for Binary Floating-Point
2567
Arithmetic.
2568
-------------------------------------------------------------------------------
2569
*/
2570
float128 float64_to_float128( float64 a )
2571
{
2572
flag aSign;
2573
int16 aExp;
2574
bits64 aSig, zSig0, zSig1;
2575
2576
aSig = extractFloat64Frac( a );
2577
aExp = extractFloat64Exp( a );
2578
aSign = extractFloat64Sign( a );
2579
if ( aExp == 0x7FF ) {
2580
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2581
return packFloat128( aSign, 0x7FFF, 0, 0 );
2582
}
2583
if ( aExp == 0 ) {
2584
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2585
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2586
--aExp;
2587
}
2588
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2589
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2590
2591
}
2592
2593
#endif
2594
2595
#ifndef SOFTFLOAT_FOR_GCC
2596
/*
2597
-------------------------------------------------------------------------------
2598
Rounds the double-precision floating-point value `a' to an integer, and
2599
returns the result as a double-precision floating-point value. The
2600
operation is performed according to the IEC/IEEE Standard for Binary
2601
Floating-Point Arithmetic.
2602
-------------------------------------------------------------------------------
2603
*/
2604
float64 float64_round_to_int( float64 a )
2605
{
2606
flag aSign;
2607
int16 aExp;
2608
bits64 lastBitMask, roundBitsMask;
2609
int8 roundingMode;
2610
float64 z;
2611
2612
aExp = extractFloat64Exp( a );
2613
if ( 0x433 <= aExp ) {
2614
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2615
return propagateFloat64NaN( a, a );
2616
}
2617
return a;
2618
}
2619
if ( aExp < 0x3FF ) {
2620
if ( (bits64) ( a<<1 ) == 0 ) return a;
2621
float_exception_flags |= float_flag_inexact;
2622
aSign = extractFloat64Sign( a );
2623
switch ( float_rounding_mode ) {
2624
case float_round_nearest_even:
2625
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2626
return packFloat64( aSign, 0x3FF, 0 );
2627
}
2628
break;
2629
case float_round_to_zero:
2630
break;
2631
case float_round_down:
2632
return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2633
case float_round_up:
2634
return
2635
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2636
}
2637
return packFloat64( aSign, 0, 0 );
2638
}
2639
lastBitMask = 1;
2640
lastBitMask <<= 0x433 - aExp;
2641
roundBitsMask = lastBitMask - 1;
2642
z = a;
2643
roundingMode = float_rounding_mode;
2644
if ( roundingMode == float_round_nearest_even ) {
2645
z += lastBitMask>>1;
2646
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2647
}
2648
else if ( roundingMode != float_round_to_zero ) {
2649
if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2650
z += roundBitsMask;
2651
}
2652
}
2653
z &= ~ roundBitsMask;
2654
if ( z != a ) float_exception_flags |= float_flag_inexact;
2655
return z;
2656
2657
}
2658
#endif
2659
2660
/*
2661
-------------------------------------------------------------------------------
2662
Returns the result of adding the absolute values of the double-precision
2663
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2664
before being returned. `zSign' is ignored if the result is a NaN.
2665
The addition is performed according to the IEC/IEEE Standard for Binary
2666
Floating-Point Arithmetic.
2667
-------------------------------------------------------------------------------
2668
*/
2669
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2670
{
2671
int16 aExp, bExp, zExp;
2672
bits64 aSig, bSig, zSig;
2673
int16 expDiff;
2674
2675
aSig = extractFloat64Frac( a );
2676
aExp = extractFloat64Exp( a );
2677
bSig = extractFloat64Frac( b );
2678
bExp = extractFloat64Exp( b );
2679
expDiff = aExp - bExp;
2680
aSig <<= 9;
2681
bSig <<= 9;
2682
if ( 0 < expDiff ) {
2683
if ( aExp == 0x7FF ) {
2684
if ( aSig ) return propagateFloat64NaN( a, b );
2685
return a;
2686
}
2687
if ( bExp == 0 ) {
2688
--expDiff;
2689
}
2690
else {
2691
bSig |= LIT64( 0x2000000000000000 );
2692
}
2693
shift64RightJamming( bSig, expDiff, &bSig );
2694
zExp = aExp;
2695
}
2696
else if ( expDiff < 0 ) {
2697
if ( bExp == 0x7FF ) {
2698
if ( bSig ) return propagateFloat64NaN( a, b );
2699
return packFloat64( zSign, 0x7FF, 0 );
2700
}
2701
if ( aExp == 0 ) {
2702
++expDiff;
2703
}
2704
else {
2705
aSig |= LIT64( 0x2000000000000000 );
2706
}
2707
shift64RightJamming( aSig, - expDiff, &aSig );
2708
zExp = bExp;
2709
}
2710
else {
2711
if ( aExp == 0x7FF ) {
2712
if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713
return a;
2714
}
2715
if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2716
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2717
zExp = aExp;
2718
goto roundAndPack;
2719
}
2720
aSig |= LIT64( 0x2000000000000000 );
2721
zSig = ( aSig + bSig )<<1;
2722
--zExp;
2723
if ( (sbits64) zSig < 0 ) {
2724
zSig = aSig + bSig;
2725
++zExp;
2726
}
2727
roundAndPack:
2728
return roundAndPackFloat64( zSign, zExp, zSig );
2729
2730
}
2731
2732
/*
2733
-------------------------------------------------------------------------------
2734
Returns the result of subtracting the absolute values of the double-
2735
precision floating-point values `a' and `b'. If `zSign' is 1, the
2736
difference is negated before being returned. `zSign' is ignored if the
2737
result is a NaN. The subtraction is performed according to the IEC/IEEE
2738
Standard for Binary Floating-Point Arithmetic.
2739
-------------------------------------------------------------------------------
2740
*/
2741
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2742
{
2743
int16 aExp, bExp, zExp;
2744
bits64 aSig, bSig, zSig;
2745
int16 expDiff;
2746
2747
aSig = extractFloat64Frac( a );
2748
aExp = extractFloat64Exp( a );
2749
bSig = extractFloat64Frac( b );
2750
bExp = extractFloat64Exp( b );
2751
expDiff = aExp - bExp;
2752
aSig <<= 10;
2753
bSig <<= 10;
2754
if ( 0 < expDiff ) goto aExpBigger;
2755
if ( expDiff < 0 ) goto bExpBigger;
2756
if ( aExp == 0x7FF ) {
2757
if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2758
float_raise( float_flag_invalid );
2759
return float64_default_nan;
2760
}
2761
if ( aExp == 0 ) {
2762
aExp = 1;
2763
bExp = 1;
2764
}
2765
if ( bSig < aSig ) goto aBigger;
2766
if ( aSig < bSig ) goto bBigger;
2767
return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2768
bExpBigger:
2769
if ( bExp == 0x7FF ) {
2770
if ( bSig ) return propagateFloat64NaN( a, b );
2771
return packFloat64( zSign ^ 1, 0x7FF, 0 );
2772
}
2773
if ( aExp == 0 ) {
2774
++expDiff;
2775
}
2776
else {
2777
aSig |= LIT64( 0x4000000000000000 );
2778
}
2779
shift64RightJamming( aSig, - expDiff, &aSig );
2780
bSig |= LIT64( 0x4000000000000000 );
2781
bBigger:
2782
zSig = bSig - aSig;
2783
zExp = bExp;
2784
zSign ^= 1;
2785
goto normalizeRoundAndPack;
2786
aExpBigger:
2787
if ( aExp == 0x7FF ) {
2788
if ( aSig ) return propagateFloat64NaN( a, b );
2789
return a;
2790
}
2791
if ( bExp == 0 ) {
2792
--expDiff;
2793
}
2794
else {
2795
bSig |= LIT64( 0x4000000000000000 );
2796
}
2797
shift64RightJamming( bSig, expDiff, &bSig );
2798
aSig |= LIT64( 0x4000000000000000 );
2799
aBigger:
2800
zSig = aSig - bSig;
2801
zExp = aExp;
2802
normalizeRoundAndPack:
2803
--zExp;
2804
return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2805
2806
}
2807
2808
/*
2809
-------------------------------------------------------------------------------
2810
Returns the result of adding the double-precision floating-point values `a'
2811
and `b'. The operation is performed according to the IEC/IEEE Standard for
2812
Binary Floating-Point Arithmetic.
2813
-------------------------------------------------------------------------------
2814
*/
2815
float64 float64_add( float64 a, float64 b )
2816
{
2817
flag aSign, bSign;
2818
2819
aSign = extractFloat64Sign( a );
2820
bSign = extractFloat64Sign( b );
2821
if ( aSign == bSign ) {
2822
return addFloat64Sigs( a, b, aSign );
2823
}
2824
else {
2825
return subFloat64Sigs( a, b, aSign );
2826
}
2827
2828
}
2829
2830
/*
2831
-------------------------------------------------------------------------------
2832
Returns the result of subtracting the double-precision floating-point values
2833
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
2834
for Binary Floating-Point Arithmetic.
2835
-------------------------------------------------------------------------------
2836
*/
2837
float64 float64_sub( float64 a, float64 b )
2838
{
2839
flag aSign, bSign;
2840
2841
aSign = extractFloat64Sign( a );
2842
bSign = extractFloat64Sign( b );
2843
if ( aSign == bSign ) {
2844
return subFloat64Sigs( a, b, aSign );
2845
}
2846
else {
2847
return addFloat64Sigs( a, b, aSign );
2848
}
2849
2850
}
2851
2852
/*
2853
-------------------------------------------------------------------------------
2854
Returns the result of multiplying the double-precision floating-point values
2855
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
2856
for Binary Floating-Point Arithmetic.
2857
-------------------------------------------------------------------------------
2858
*/
2859
float64 float64_mul( float64 a, float64 b )
2860
{
2861
flag aSign, bSign, zSign;
2862
int16 aExp, bExp, zExp;
2863
bits64 aSig, bSig, zSig0, zSig1;
2864
2865
aSig = extractFloat64Frac( a );
2866
aExp = extractFloat64Exp( a );
2867
aSign = extractFloat64Sign( a );
2868
bSig = extractFloat64Frac( b );
2869
bExp = extractFloat64Exp( b );
2870
bSign = extractFloat64Sign( b );
2871
zSign = aSign ^ bSign;
2872
if ( aExp == 0x7FF ) {
2873
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2874
return propagateFloat64NaN( a, b );
2875
}
2876
if ( ( bExp | bSig ) == 0 ) {
2877
float_raise( float_flag_invalid );
2878
return float64_default_nan;
2879
}
2880
return packFloat64( zSign, 0x7FF, 0 );
2881
}
2882
if ( bExp == 0x7FF ) {
2883
if ( bSig ) return propagateFloat64NaN( a, b );
2884
if ( ( aExp | aSig ) == 0 ) {
2885
float_raise( float_flag_invalid );
2886
return float64_default_nan;
2887
}
2888
return packFloat64( zSign, 0x7FF, 0 );
2889
}
2890
if ( aExp == 0 ) {
2891
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2892
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2893
}
2894
if ( bExp == 0 ) {
2895
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2896
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897
}
2898
zExp = aExp + bExp - 0x3FF;
2899
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901
mul64To128( aSig, bSig, &zSig0, &zSig1 );
2902
zSig0 |= ( zSig1 != 0 );
2903
if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2904
zSig0 <<= 1;
2905
--zExp;
2906
}
2907
return roundAndPackFloat64( zSign, zExp, zSig0 );
2908
2909
}
2910
2911
/*
2912
-------------------------------------------------------------------------------
2913
Returns the result of dividing the double-precision floating-point value `a'
2914
by the corresponding value `b'. The operation is performed according to
2915
the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916
-------------------------------------------------------------------------------
2917
*/
2918
float64 float64_div( float64 a, float64 b )
2919
{
2920
flag aSign, bSign, zSign;
2921
int16 aExp, bExp, zExp;
2922
bits64 aSig, bSig, zSig;
2923
bits64 rem0, rem1;
2924
bits64 term0, term1;
2925
2926
aSig = extractFloat64Frac( a );
2927
aExp = extractFloat64Exp( a );
2928
aSign = extractFloat64Sign( a );
2929
bSig = extractFloat64Frac( b );
2930
bExp = extractFloat64Exp( b );
2931
bSign = extractFloat64Sign( b );
2932
zSign = aSign ^ bSign;
2933
if ( aExp == 0x7FF ) {
2934
if ( aSig ) return propagateFloat64NaN( a, b );
2935
if ( bExp == 0x7FF ) {
2936
if ( bSig ) return propagateFloat64NaN( a, b );
2937
float_raise( float_flag_invalid );
2938
return float64_default_nan;
2939
}
2940
return packFloat64( zSign, 0x7FF, 0 );
2941
}
2942
if ( bExp == 0x7FF ) {
2943
if ( bSig ) return propagateFloat64NaN( a, b );
2944
return packFloat64( zSign, 0, 0 );
2945
}
2946
if ( bExp == 0 ) {
2947
if ( bSig == 0 ) {
2948
if ( ( aExp | aSig ) == 0 ) {
2949
float_raise( float_flag_invalid );
2950
return float64_default_nan;
2951
}
2952
float_raise( float_flag_divbyzero );
2953
return packFloat64( zSign, 0x7FF, 0 );
2954
}
2955
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956
}
2957
if ( aExp == 0 ) {
2958
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2959
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960
}
2961
zExp = aExp - bExp + 0x3FD;
2962
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2963
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964
if ( bSig <= ( aSig + aSig ) ) {
2965
aSig >>= 1;
2966
++zExp;
2967
}
2968
zSig = estimateDiv128To64( aSig, 0, bSig );
2969
if ( ( zSig & 0x1FF ) <= 2 ) {
2970
mul64To128( bSig, zSig, &term0, &term1 );
2971
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2972
while ( (sbits64) rem0 < 0 ) {
2973
--zSig;
2974
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2975
}
2976
zSig |= ( rem1 != 0 );
2977
}
2978
return roundAndPackFloat64( zSign, zExp, zSig );
2979
2980
}
2981
2982
#ifndef SOFTFLOAT_FOR_GCC
2983
/*
2984
-------------------------------------------------------------------------------
2985
Returns the remainder of the double-precision floating-point value `a'
2986
with respect to the corresponding value `b'. The operation is performed
2987
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988
-------------------------------------------------------------------------------
2989
*/
2990
float64 float64_rem( float64 a, float64 b )
2991
{
2992
flag aSign, bSign, zSign;
2993
int16 aExp, bExp, expDiff;
2994
bits64 aSig, bSig;
2995
bits64 q, alternateASig;
2996
sbits64 sigMean;
2997
2998
aSig = extractFloat64Frac( a );
2999
aExp = extractFloat64Exp( a );
3000
aSign = extractFloat64Sign( a );
3001
bSig = extractFloat64Frac( b );
3002
bExp = extractFloat64Exp( b );
3003
bSign = extractFloat64Sign( b );
3004
if ( aExp == 0x7FF ) {
3005
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3006
return propagateFloat64NaN( a, b );
3007
}
3008
float_raise( float_flag_invalid );
3009
return float64_default_nan;
3010
}
3011
if ( bExp == 0x7FF ) {
3012
if ( bSig ) return propagateFloat64NaN( a, b );
3013
return a;
3014
}
3015
if ( bExp == 0 ) {
3016
if ( bSig == 0 ) {
3017
float_raise( float_flag_invalid );
3018
return float64_default_nan;
3019
}
3020
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3021
}
3022
if ( aExp == 0 ) {
3023
if ( aSig == 0 ) return a;
3024
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3025
}
3026
expDiff = aExp - bExp;
3027
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3028
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3029
if ( expDiff < 0 ) {
3030
if ( expDiff < -1 ) return a;
3031
aSig >>= 1;
3032
}
3033
q = ( bSig <= aSig );
3034
if ( q ) aSig -= bSig;
3035
expDiff -= 64;
3036
while ( 0 < expDiff ) {
3037
q = estimateDiv128To64( aSig, 0, bSig );
3038
q = ( 2 < q ) ? q - 2 : 0;
3039
aSig = - ( ( bSig>>2 ) * q );
3040
expDiff -= 62;
3041
}
3042
expDiff += 64;
3043
if ( 0 < expDiff ) {
3044
q = estimateDiv128To64( aSig, 0, bSig );
3045
q = ( 2 < q ) ? q - 2 : 0;
3046
q >>= 64 - expDiff;
3047
bSig >>= 2;
3048
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3049
}
3050
else {
3051
aSig >>= 2;
3052
bSig >>= 2;
3053
}
3054
do {
3055
alternateASig = aSig;
3056
++q;
3057
aSig -= bSig;
3058
} while ( 0 <= (sbits64) aSig );
3059
sigMean = aSig + alternateASig;
3060
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3061
aSig = alternateASig;
3062
}
3063
zSign = ( (sbits64) aSig < 0 );
3064
if ( zSign ) aSig = - aSig;
3065
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3066
3067
}
3068
3069
/*
3070
-------------------------------------------------------------------------------
3071
Returns the square root of the double-precision floating-point value `a'.
3072
The operation is performed according to the IEC/IEEE Standard for Binary
3073
Floating-Point Arithmetic.
3074
-------------------------------------------------------------------------------
3075
*/
3076
float64 float64_sqrt( float64 a )
3077
{
3078
flag aSign;
3079
int16 aExp, zExp;
3080
bits64 aSig, zSig, doubleZSig;
3081
bits64 rem0, rem1, term0, term1;
3082
3083
aSig = extractFloat64Frac( a );
3084
aExp = extractFloat64Exp( a );
3085
aSign = extractFloat64Sign( a );
3086
if ( aExp == 0x7FF ) {
3087
if ( aSig ) return propagateFloat64NaN( a, a );
3088
if ( ! aSign ) return a;
3089
float_raise( float_flag_invalid );
3090
return float64_default_nan;
3091
}
3092
if ( aSign ) {
3093
if ( ( aExp | aSig ) == 0 ) return a;
3094
float_raise( float_flag_invalid );
3095
return float64_default_nan;
3096
}
3097
if ( aExp == 0 ) {
3098
if ( aSig == 0 ) return 0;
3099
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3100
}
3101
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3102
aSig |= LIT64( 0x0010000000000000 );
3103
zSig = estimateSqrt32( aExp, aSig>>21 );
3104
aSig <<= 9 - ( aExp & 1 );
3105
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3106
if ( ( zSig & 0x1FF ) <= 5 ) {
3107
doubleZSig = zSig<<1;
3108
mul64To128( zSig, zSig, &term0, &term1 );
3109
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3110
while ( (sbits64) rem0 < 0 ) {
3111
--zSig;
3112
doubleZSig -= 2;
3113
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3114
}
3115
zSig |= ( ( rem0 | rem1 ) != 0 );
3116
}
3117
return roundAndPackFloat64( 0, zExp, zSig );
3118
3119
}
3120
#endif
3121
3122
/*
3123
-------------------------------------------------------------------------------
3124
Returns 1 if the double-precision floating-point value `a' is equal to the
3125
corresponding value `b', and 0 otherwise. The comparison is performed
3126
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127
-------------------------------------------------------------------------------
3128
*/
3129
flag float64_eq( float64 a, float64 b )
3130
{
3131
3132
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3133
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3134
) {
3135
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3136
float_raise( float_flag_invalid );
3137
}
3138
return 0;
3139
}
3140
return ( a == b ) ||
3141
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3142
3143
}
3144
3145
/*
3146
-------------------------------------------------------------------------------
3147
Returns 1 if the double-precision floating-point value `a' is less than or
3148
equal to the corresponding value `b', and 0 otherwise. The comparison is
3149
performed according to the IEC/IEEE Standard for Binary Floating-Point
3150
Arithmetic.
3151
-------------------------------------------------------------------------------
3152
*/
3153
flag float64_le( float64 a, float64 b )
3154
{
3155
flag aSign, bSign;
3156
3157
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3158
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3159
) {
3160
float_raise( float_flag_invalid );
3161
return 0;
3162
}
3163
aSign = extractFloat64Sign( a );
3164
bSign = extractFloat64Sign( b );
3165
if ( aSign != bSign )
3166
return aSign ||
3167
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3168
0 );
3169
return ( a == b ) ||
3170
( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3171
3172
}
3173
3174
/*
3175
-------------------------------------------------------------------------------
3176
Returns 1 if the double-precision floating-point value `a' is less than
3177
the corresponding value `b', and 0 otherwise. The comparison is performed
3178
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179
-------------------------------------------------------------------------------
3180
*/
3181
flag float64_lt( float64 a, float64 b )
3182
{
3183
flag aSign, bSign;
3184
3185
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3186
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3187
) {
3188
float_raise( float_flag_invalid );
3189
return 0;
3190
}
3191
aSign = extractFloat64Sign( a );
3192
bSign = extractFloat64Sign( b );
3193
if ( aSign != bSign )
3194
return aSign &&
3195
( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3196
0 );
3197
return ( a != b ) &&
3198
( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3199
3200
}
3201
3202
#ifndef SOFTFLOAT_FOR_GCC
3203
/*
3204
-------------------------------------------------------------------------------
3205
Returns 1 if the double-precision floating-point value `a' is equal to the
3206
corresponding value `b', and 0 otherwise. The invalid exception is raised
3207
if either operand is a NaN. Otherwise, the comparison is performed
3208
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209
-------------------------------------------------------------------------------
3210
*/
3211
flag float64_eq_signaling( float64 a, float64 b )
3212
{
3213
3214
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3215
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3216
) {
3217
float_raise( float_flag_invalid );
3218
return 0;
3219
}
3220
return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3221
3222
}
3223
3224
/*
3225
-------------------------------------------------------------------------------
3226
Returns 1 if the double-precision floating-point value `a' is less than or
3227
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3228
cause an exception. Otherwise, the comparison is performed according to the
3229
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230
-------------------------------------------------------------------------------
3231
*/
3232
flag float64_le_quiet( float64 a, float64 b )
3233
{
3234
flag aSign, bSign;
3235
3236
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3237
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3238
) {
3239
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3240
float_raise( float_flag_invalid );
3241
}
3242
return 0;
3243
}
3244
aSign = extractFloat64Sign( a );
3245
bSign = extractFloat64Sign( b );
3246
if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3247
return ( a == b ) || ( aSign ^ ( a < b ) );
3248
3249
}
3250
3251
/*
3252
-------------------------------------------------------------------------------
3253
Returns 1 if the double-precision floating-point value `a' is less than
3254
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3255
exception. Otherwise, the comparison is performed according to the IEC/IEEE
3256
Standard for Binary Floating-Point Arithmetic.
3257
-------------------------------------------------------------------------------
3258
*/
3259
flag float64_lt_quiet( float64 a, float64 b )
3260
{
3261
flag aSign, bSign;
3262
3263
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3264
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3265
) {
3266
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3267
float_raise( float_flag_invalid );
3268
}
3269
return 0;
3270
}
3271
aSign = extractFloat64Sign( a );
3272
bSign = extractFloat64Sign( b );
3273
if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3274
return ( a != b ) && ( aSign ^ ( a < b ) );
3275
3276
}
3277
#endif
3278
3279
#ifdef FLOATX80
3280
3281
/*
3282
-------------------------------------------------------------------------------
3283
Returns the result of converting the extended double-precision floating-
3284
point value `a' to the 32-bit two's complement integer format. The
3285
conversion is performed according to the IEC/IEEE Standard for Binary
3286
Floating-Point Arithmetic---which means in particular that the conversion
3287
is rounded according to the current rounding mode. If `a' is a NaN, the
3288
largest positive integer is returned. Otherwise, if the conversion
3289
overflows, the largest integer with the same sign as `a' is returned.
3290
-------------------------------------------------------------------------------
3291
*/
3292
int32 floatx80_to_int32( floatx80 a )
3293
{
3294
flag aSign;
3295
int32 aExp, shiftCount;
3296
bits64 aSig;
3297
3298
aSig = extractFloatx80Frac( a );
3299
aExp = extractFloatx80Exp( a );
3300
aSign = extractFloatx80Sign( a );
3301
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3302
shiftCount = 0x4037 - aExp;
3303
if ( shiftCount <= 0 ) shiftCount = 1;
3304
shift64RightJamming( aSig, shiftCount, &aSig );
3305
return roundAndPackInt32( aSign, aSig );
3306
3307
}
3308
3309
/*
3310
-------------------------------------------------------------------------------
3311
Returns the result of converting the extended double-precision floating-
3312
point value `a' to the 32-bit two's complement integer format. The
3313
conversion is performed according to the IEC/IEEE Standard for Binary
3314
Floating-Point Arithmetic, except that the conversion is always rounded
3315
toward zero. If `a' is a NaN, the largest positive integer is returned.
3316
Otherwise, if the conversion overflows, the largest integer with the same
3317
sign as `a' is returned.
3318
-------------------------------------------------------------------------------
3319
*/
3320
int32 floatx80_to_int32_round_to_zero( floatx80 a )
3321
{
3322
flag aSign;
3323
int32 aExp, shiftCount;
3324
bits64 aSig, savedASig;
3325
int32 z;
3326
3327
aSig = extractFloatx80Frac( a );
3328
aExp = extractFloatx80Exp( a );
3329
aSign = extractFloatx80Sign( a );
3330
if ( 0x401E < aExp ) {
3331
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3332
goto invalid;
3333
}
3334
else if ( aExp < 0x3FFF ) {
3335
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3336
return 0;
3337
}
3338
shiftCount = 0x403E - aExp;
3339
savedASig = aSig;
3340
aSig >>= shiftCount;
3341
z = aSig;
3342
if ( aSign ) z = - z;
3343
if ( ( z < 0 ) ^ aSign ) {
3344
invalid:
3345
float_raise( float_flag_invalid );
3346
return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3347
}
3348
if ( ( aSig<<shiftCount ) != savedASig ) {
3349
float_exception_flags |= float_flag_inexact;
3350
}
3351
return z;
3352
3353
}
3354
3355
/*
3356
-------------------------------------------------------------------------------
3357
Returns the result of converting the extended double-precision floating-
3358
point value `a' to the 64-bit two's complement integer format. The
3359
conversion is performed according to the IEC/IEEE Standard for Binary
3360
Floating-Point Arithmetic---which means in particular that the conversion
3361
is rounded according to the current rounding mode. If `a' is a NaN,
3362
the largest positive integer is returned. Otherwise, if the conversion
3363
overflows, the largest integer with the same sign as `a' is returned.
3364
-------------------------------------------------------------------------------
3365
*/
3366
int64 floatx80_to_int64( floatx80 a )
3367
{
3368
flag aSign;
3369
int32 aExp, shiftCount;
3370
bits64 aSig, aSigExtra;
3371
3372
aSig = extractFloatx80Frac( a );
3373
aExp = extractFloatx80Exp( a );
3374
aSign = extractFloatx80Sign( a );
3375
shiftCount = 0x403E - aExp;
3376
if ( shiftCount <= 0 ) {
3377
if ( shiftCount ) {
3378
float_raise( float_flag_invalid );
3379
if ( ! aSign
3380
|| ( ( aExp == 0x7FFF )
3381
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3382
) {
3383
return LIT64( 0x7FFFFFFFFFFFFFFF );
3384
}
3385
return (sbits64) LIT64( 0x8000000000000000 );
3386
}
3387
aSigExtra = 0;
3388
}
3389
else {
3390
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3391
}
3392
return roundAndPackInt64( aSign, aSig, aSigExtra );
3393
3394
}
3395
3396
/*
3397
-------------------------------------------------------------------------------
3398
Returns the result of converting the extended double-precision floating-
3399
point value `a' to the 64-bit two's complement integer format. The
3400
conversion is performed according to the IEC/IEEE Standard for Binary
3401
Floating-Point Arithmetic, except that the conversion is always rounded
3402
toward zero. If `a' is a NaN, the largest positive integer is returned.
3403
Otherwise, if the conversion overflows, the largest integer with the same
3404
sign as `a' is returned.
3405
-------------------------------------------------------------------------------
3406
*/
3407
int64 floatx80_to_int64_round_to_zero( floatx80 a )
3408
{
3409
flag aSign;
3410
int32 aExp, shiftCount;
3411
bits64 aSig;
3412
int64 z;
3413
3414
aSig = extractFloatx80Frac( a );
3415
aExp = extractFloatx80Exp( a );
3416
aSign = extractFloatx80Sign( a );
3417
shiftCount = aExp - 0x403E;
3418
if ( 0 <= shiftCount ) {
3419
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3420
if ( ( a.high != 0xC03E ) || aSig ) {
3421
float_raise( float_flag_invalid );
3422
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3423
return LIT64( 0x7FFFFFFFFFFFFFFF );
3424
}
3425
}
3426
return (sbits64) LIT64( 0x8000000000000000 );
3427
}
3428
else if ( aExp < 0x3FFF ) {
3429
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3430
return 0;
3431
}
3432
z = aSig>>( - shiftCount );
3433
if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3434
float_exception_flags |= float_flag_inexact;
3435
}
3436
if ( aSign ) z = - z;
3437
return z;
3438
3439
}
3440
3441
/*
3442
-------------------------------------------------------------------------------
3443
Returns the result of converting the extended double-precision floating-
3444
point value `a' to the single-precision floating-point format. The
3445
conversion is performed according to the IEC/IEEE Standard for Binary
3446
Floating-Point Arithmetic.
3447
-------------------------------------------------------------------------------
3448
*/
3449
float32 floatx80_to_float32( floatx80 a )
3450
{
3451
flag aSign;
3452
int32 aExp;
3453
bits64 aSig;
3454
3455
aSig = extractFloatx80Frac( a );
3456
aExp = extractFloatx80Exp( a );
3457
aSign = extractFloatx80Sign( a );
3458
if ( aExp == 0x7FFF ) {
3459
if ( (bits64) ( aSig<<1 ) ) {
3460
return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3461
}
3462
return packFloat32( aSign, 0xFF, 0 );
3463
}
3464
shift64RightJamming( aSig, 33, &aSig );
3465
if ( aExp || aSig ) aExp -= 0x3F81;
3466
return roundAndPackFloat32( aSign, aExp, aSig );
3467
3468
}
3469
3470
/*
3471
-------------------------------------------------------------------------------
3472
Returns the result of converting the extended double-precision floating-
3473
point value `a' to the double-precision floating-point format. The
3474
conversion is performed according to the IEC/IEEE Standard for Binary
3475
Floating-Point Arithmetic.
3476
-------------------------------------------------------------------------------
3477
*/
3478
float64 floatx80_to_float64( floatx80 a )
3479
{
3480
flag aSign;
3481
int32 aExp;
3482
bits64 aSig, zSig;
3483
3484
aSig = extractFloatx80Frac( a );
3485
aExp = extractFloatx80Exp( a );
3486
aSign = extractFloatx80Sign( a );
3487
if ( aExp == 0x7FFF ) {
3488
if ( (bits64) ( aSig<<1 ) ) {
3489
return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3490
}
3491
return packFloat64( aSign, 0x7FF, 0 );
3492
}
3493
shift64RightJamming( aSig, 1, &zSig );
3494
if ( aExp || aSig ) aExp -= 0x3C01;
3495
return roundAndPackFloat64( aSign, aExp, zSig );
3496
3497
}
3498
3499
#ifdef FLOAT128
3500
3501
/*
3502
-------------------------------------------------------------------------------
3503
Returns the result of converting the extended double-precision floating-
3504
point value `a' to the quadruple-precision floating-point format. The
3505
conversion is performed according to the IEC/IEEE Standard for Binary
3506
Floating-Point Arithmetic.
3507
-------------------------------------------------------------------------------
3508
*/
3509
float128 floatx80_to_float128( floatx80 a )
3510
{
3511
flag aSign;
3512
int16 aExp;
3513
bits64 aSig, zSig0, zSig1;
3514
3515
aSig = extractFloatx80Frac( a );
3516
aExp = extractFloatx80Exp( a );
3517
aSign = extractFloatx80Sign( a );
3518
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3519
return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3520
}
3521
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3522
return packFloat128( aSign, aExp, zSig0, zSig1 );
3523
3524
}
3525
3526
#endif
3527
3528
/*
3529
-------------------------------------------------------------------------------
3530
Rounds the extended double-precision floating-point value `a' to an integer,
3531
and returns the result as an extended quadruple-precision floating-point
3532
value. The operation is performed according to the IEC/IEEE Standard for
3533
Binary Floating-Point Arithmetic.
3534
-------------------------------------------------------------------------------
3535
*/
3536
floatx80 floatx80_round_to_int( floatx80 a )
3537
{
3538
flag aSign;
3539
int32 aExp;
3540
bits64 lastBitMask, roundBitsMask;
3541
int8 roundingMode;
3542
floatx80 z;
3543
3544
aExp = extractFloatx80Exp( a );
3545
if ( 0x403E <= aExp ) {
3546
if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3547
return propagateFloatx80NaN( a, a );
3548
}
3549
return a;
3550
}
3551
if ( aExp < 0x3FFF ) {
3552
if ( ( aExp == 0 )
3553
&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3554
return a;
3555
}
3556
float_exception_flags |= float_flag_inexact;
3557
aSign = extractFloatx80Sign( a );
3558
switch ( float_rounding_mode ) {
3559
case float_round_nearest_even:
3560
if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3561
) {
3562
return
3563
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3564
}
3565
break;
3566
case float_round_to_zero:
3567
break;
3568
case float_round_down:
3569
return
3570
aSign ?
3571
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3572
: packFloatx80( 0, 0, 0 );
3573
case float_round_up:
3574
return
3575
aSign ? packFloatx80( 1, 0, 0 )
3576
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3577
}
3578
return packFloatx80( aSign, 0, 0 );
3579
}
3580
lastBitMask = 1;
3581
lastBitMask <<= 0x403E - aExp;
3582
roundBitsMask = lastBitMask - 1;
3583
z = a;
3584
roundingMode = float_rounding_mode;
3585
if ( roundingMode == float_round_nearest_even ) {
3586
z.low += lastBitMask>>1;
3587
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3588
}
3589
else if ( roundingMode != float_round_to_zero ) {
3590
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3591
z.low += roundBitsMask;
3592
}
3593
}
3594
z.low &= ~ roundBitsMask;
3595
if ( z.low == 0 ) {
3596
++z.high;
3597
z.low = LIT64( 0x8000000000000000 );
3598
}
3599
if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3600
return z;
3601
3602
}
3603
3604
/*
3605
-------------------------------------------------------------------------------
3606
Returns the result of adding the absolute values of the extended double-
3607
precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3608
negated before being returned. `zSign' is ignored if the result is a NaN.
3609
The addition is performed according to the IEC/IEEE Standard for Binary
3610
Floating-Point Arithmetic.
3611
-------------------------------------------------------------------------------
3612
*/
3613
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3614
{
3615
int32 aExp, bExp, zExp;
3616
bits64 aSig, bSig, zSig0, zSig1;
3617
int32 expDiff;
3618
3619
aSig = extractFloatx80Frac( a );
3620
aExp = extractFloatx80Exp( a );
3621
bSig = extractFloatx80Frac( b );
3622
bExp = extractFloatx80Exp( b );
3623
expDiff = aExp - bExp;
3624
if ( 0 < expDiff ) {
3625
if ( aExp == 0x7FFF ) {
3626
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3627
return a;
3628
}
3629
if ( bExp == 0 ) --expDiff;
3630
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3631
zExp = aExp;
3632
}
3633
else if ( expDiff < 0 ) {
3634
if ( bExp == 0x7FFF ) {
3635
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3636
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637
}
3638
if ( aExp == 0 ) ++expDiff;
3639
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3640
zExp = bExp;
3641
}
3642
else {
3643
if ( aExp == 0x7FFF ) {
3644
if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3645
return propagateFloatx80NaN( a, b );
3646
}
3647
return a;
3648
}
3649
zSig1 = 0;
3650
zSig0 = aSig + bSig;
3651
if ( aExp == 0 ) {
3652
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3653
goto roundAndPack;
3654
}
3655
zExp = aExp;
3656
goto shiftRight1;
3657
}
3658
zSig0 = aSig + bSig;
3659
if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3660
shiftRight1:
3661
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3662
zSig0 |= LIT64( 0x8000000000000000 );
3663
++zExp;
3664
roundAndPack:
3665
return
3666
roundAndPackFloatx80(
3667
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668
3669
}
3670
3671
/*
3672
-------------------------------------------------------------------------------
3673
Returns the result of subtracting the absolute values of the extended
3674
double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3675
difference is negated before being returned. `zSign' is ignored if the
3676
result is a NaN. The subtraction is performed according to the IEC/IEEE
3677
Standard for Binary Floating-Point Arithmetic.
3678
-------------------------------------------------------------------------------
3679
*/
3680
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3681
{
3682
int32 aExp, bExp, zExp;
3683
bits64 aSig, bSig, zSig0, zSig1;
3684
int32 expDiff;
3685
floatx80 z;
3686
3687
aSig = extractFloatx80Frac( a );
3688
aExp = extractFloatx80Exp( a );
3689
bSig = extractFloatx80Frac( b );
3690
bExp = extractFloatx80Exp( b );
3691
expDiff = aExp - bExp;
3692
if ( 0 < expDiff ) goto aExpBigger;
3693
if ( expDiff < 0 ) goto bExpBigger;
3694
if ( aExp == 0x7FFF ) {
3695
if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3696
return propagateFloatx80NaN( a, b );
3697
}
3698
float_raise( float_flag_invalid );
3699
z.low = floatx80_default_nan_low;
3700
z.high = floatx80_default_nan_high;
3701
return z;
3702
}
3703
if ( aExp == 0 ) {
3704
aExp = 1;
3705
bExp = 1;
3706
}
3707
zSig1 = 0;
3708
if ( bSig < aSig ) goto aBigger;
3709
if ( aSig < bSig ) goto bBigger;
3710
return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3711
bExpBigger:
3712
if ( bExp == 0x7FFF ) {
3713
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3714
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3715
}
3716
if ( aExp == 0 ) ++expDiff;
3717
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3718
bBigger:
3719
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3720
zExp = bExp;
3721
zSign ^= 1;
3722
goto normalizeRoundAndPack;
3723
aExpBigger:
3724
if ( aExp == 0x7FFF ) {
3725
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3726
return a;
3727
}
3728
if ( bExp == 0 ) --expDiff;
3729
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3730
aBigger:
3731
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3732
zExp = aExp;
3733
normalizeRoundAndPack:
3734
return
3735
normalizeRoundAndPackFloatx80(
3736
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3737
3738
}
3739
3740
/*
3741
-------------------------------------------------------------------------------
3742
Returns the result of adding the extended double-precision floating-point
3743
values `a' and `b'. The operation is performed according to the IEC/IEEE
3744
Standard for Binary Floating-Point Arithmetic.
3745
-------------------------------------------------------------------------------
3746
*/
3747
floatx80 floatx80_add( floatx80 a, floatx80 b )
3748
{
3749
flag aSign, bSign;
3750
3751
aSign = extractFloatx80Sign( a );
3752
bSign = extractFloatx80Sign( b );
3753
if ( aSign == bSign ) {
3754
return addFloatx80Sigs( a, b, aSign );
3755
}
3756
else {
3757
return subFloatx80Sigs( a, b, aSign );
3758
}
3759
3760
}
3761
3762
/*
3763
-------------------------------------------------------------------------------
3764
Returns the result of subtracting the extended double-precision floating-
3765
point values `a' and `b'. The operation is performed according to the
3766
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767
-------------------------------------------------------------------------------
3768
*/
3769
floatx80 floatx80_sub( floatx80 a, floatx80 b )
3770
{
3771
flag aSign, bSign;
3772
3773
aSign = extractFloatx80Sign( a );
3774
bSign = extractFloatx80Sign( b );
3775
if ( aSign == bSign ) {
3776
return subFloatx80Sigs( a, b, aSign );
3777
}
3778
else {
3779
return addFloatx80Sigs( a, b, aSign );
3780
}
3781
3782
}
3783
3784
/*
3785
-------------------------------------------------------------------------------
3786
Returns the result of multiplying the extended double-precision floating-
3787
point values `a' and `b'. The operation is performed according to the
3788
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789
-------------------------------------------------------------------------------
3790
*/
3791
floatx80 floatx80_mul( floatx80 a, floatx80 b )
3792
{
3793
flag aSign, bSign, zSign;
3794
int32 aExp, bExp, zExp;
3795
bits64 aSig, bSig, zSig0, zSig1;
3796
floatx80 z;
3797
3798
aSig = extractFloatx80Frac( a );
3799
aExp = extractFloatx80Exp( a );
3800
aSign = extractFloatx80Sign( a );
3801
bSig = extractFloatx80Frac( b );
3802
bExp = extractFloatx80Exp( b );
3803
bSign = extractFloatx80Sign( b );
3804
zSign = aSign ^ bSign;
3805
if ( aExp == 0x7FFF ) {
3806
if ( (bits64) ( aSig<<1 )
3807
|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3808
return propagateFloatx80NaN( a, b );
3809
}
3810
if ( ( bExp | bSig ) == 0 ) goto invalid;
3811
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3812
}
3813
if ( bExp == 0x7FFF ) {
3814
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3815
if ( ( aExp | aSig ) == 0 ) {
3816
invalid:
3817
float_raise( float_flag_invalid );
3818
z.low = floatx80_default_nan_low;
3819
z.high = floatx80_default_nan_high;
3820
return z;
3821
}
3822
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3823
}
3824
if ( aExp == 0 ) {
3825
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3826
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3827
}
3828
if ( bExp == 0 ) {
3829
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3830
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3831
}
3832
zExp = aExp + bExp - 0x3FFE;
3833
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3834
if ( 0 < (sbits64) zSig0 ) {
3835
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3836
--zExp;
3837
}
3838
return
3839
roundAndPackFloatx80(
3840
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3841
3842
}
3843
3844
/*
3845
-------------------------------------------------------------------------------
3846
Returns the result of dividing the extended double-precision floating-point
3847
value `a' by the corresponding value `b'. The operation is performed
3848
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849
-------------------------------------------------------------------------------
3850
*/
3851
floatx80 floatx80_div( floatx80 a, floatx80 b )
3852
{
3853
flag aSign, bSign, zSign;
3854
int32 aExp, bExp, zExp;
3855
bits64 aSig, bSig, zSig0, zSig1;
3856
bits64 rem0, rem1, rem2, term0, term1, term2;
3857
floatx80 z;
3858
3859
aSig = extractFloatx80Frac( a );
3860
aExp = extractFloatx80Exp( a );
3861
aSign = extractFloatx80Sign( a );
3862
bSig = extractFloatx80Frac( b );
3863
bExp = extractFloatx80Exp( b );
3864
bSign = extractFloatx80Sign( b );
3865
zSign = aSign ^ bSign;
3866
if ( aExp == 0x7FFF ) {
3867
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3868
if ( bExp == 0x7FFF ) {
3869
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3870
goto invalid;
3871
}
3872
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3873
}
3874
if ( bExp == 0x7FFF ) {
3875
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3876
return packFloatx80( zSign, 0, 0 );
3877
}
3878
if ( bExp == 0 ) {
3879
if ( bSig == 0 ) {
3880
if ( ( aExp | aSig ) == 0 ) {
3881
invalid:
3882
float_raise( float_flag_invalid );
3883
z.low = floatx80_default_nan_low;
3884
z.high = floatx80_default_nan_high;
3885
return z;
3886
}
3887
float_raise( float_flag_divbyzero );
3888
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3889
}
3890
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3891
}
3892
if ( aExp == 0 ) {
3893
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3894
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3895
}
3896
zExp = aExp - bExp + 0x3FFE;
3897
rem1 = 0;
3898
if ( bSig <= aSig ) {
3899
shift128Right( aSig, 0, 1, &aSig, &rem1 );
3900
++zExp;
3901
}
3902
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3903
mul64To128( bSig, zSig0, &term0, &term1 );
3904
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3905
while ( (sbits64) rem0 < 0 ) {
3906
--zSig0;
3907
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3908
}
3909
zSig1 = estimateDiv128To64( rem1, 0, bSig );
3910
if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3911
mul64To128( bSig, zSig1, &term1, &term2 );
3912
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3913
while ( (sbits64) rem1 < 0 ) {
3914
--zSig1;
3915
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3916
}
3917
zSig1 |= ( ( rem1 | rem2 ) != 0 );
3918
}
3919
return
3920
roundAndPackFloatx80(
3921
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3922
3923
}
3924
3925
/*
3926
-------------------------------------------------------------------------------
3927
Returns the remainder of the extended double-precision floating-point value
3928
`a' with respect to the corresponding value `b'. The operation is performed
3929
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930
-------------------------------------------------------------------------------
3931
*/
3932
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3933
{
3934
flag aSign, bSign, zSign;
3935
int32 aExp, bExp, expDiff;
3936
bits64 aSig0, aSig1, bSig;
3937
bits64 q, term0, term1, alternateASig0, alternateASig1;
3938
floatx80 z;
3939
3940
aSig0 = extractFloatx80Frac( a );
3941
aExp = extractFloatx80Exp( a );
3942
aSign = extractFloatx80Sign( a );
3943
bSig = extractFloatx80Frac( b );
3944
bExp = extractFloatx80Exp( b );
3945
bSign = extractFloatx80Sign( b );
3946
if ( aExp == 0x7FFF ) {
3947
if ( (bits64) ( aSig0<<1 )
3948
|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3949
return propagateFloatx80NaN( a, b );
3950
}
3951
goto invalid;
3952
}
3953
if ( bExp == 0x7FFF ) {
3954
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3955
return a;
3956
}
3957
if ( bExp == 0 ) {
3958
if ( bSig == 0 ) {
3959
invalid:
3960
float_raise( float_flag_invalid );
3961
z.low = floatx80_default_nan_low;
3962
z.high = floatx80_default_nan_high;
3963
return z;
3964
}
3965
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3966
}
3967
if ( aExp == 0 ) {
3968
if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3969
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3970
}
3971
bSig |= LIT64( 0x8000000000000000 );
3972
zSign = aSign;
3973
expDiff = aExp - bExp;
3974
aSig1 = 0;
3975
if ( expDiff < 0 ) {
3976
if ( expDiff < -1 ) return a;
3977
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3978
expDiff = 0;
3979
}
3980
q = ( bSig <= aSig0 );
3981
if ( q ) aSig0 -= bSig;
3982
expDiff -= 64;
3983
while ( 0 < expDiff ) {
3984
q = estimateDiv128To64( aSig0, aSig1, bSig );
3985
q = ( 2 < q ) ? q - 2 : 0;
3986
mul64To128( bSig, q, &term0, &term1 );
3987
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3988
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3989
expDiff -= 62;
3990
}
3991
expDiff += 64;
3992
if ( 0 < expDiff ) {
3993
q = estimateDiv128To64( aSig0, aSig1, bSig );
3994
q = ( 2 < q ) ? q - 2 : 0;
3995
q >>= 64 - expDiff;
3996
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3997
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3998
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3999
while ( le128( term0, term1, aSig0, aSig1 ) ) {
4000
++q;
4001
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4002
}
4003
}
4004
else {
4005
term1 = 0;
4006
term0 = bSig;
4007
}
4008
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4009
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011
&& ( q & 1 ) )
4012
) {
4013
aSig0 = alternateASig0;
4014
aSig1 = alternateASig1;
4015
zSign = ! zSign;
4016
}
4017
return
4018
normalizeRoundAndPackFloatx80(
4019
80, zSign, bExp + expDiff, aSig0, aSig1 );
4020
4021
}
4022
4023
/*
4024
-------------------------------------------------------------------------------
4025
Returns the square root of the extended double-precision floating-point
4026
value `a'. The operation is performed according to the IEC/IEEE Standard
4027
for Binary Floating-Point Arithmetic.
4028
-------------------------------------------------------------------------------
4029
*/
4030
floatx80 floatx80_sqrt( floatx80 a )
4031
{
4032
flag aSign;
4033
int32 aExp, zExp;
4034
bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4035
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4036
floatx80 z;
4037
4038
aSig0 = extractFloatx80Frac( a );
4039
aExp = extractFloatx80Exp( a );
4040
aSign = extractFloatx80Sign( a );
4041
if ( aExp == 0x7FFF ) {
4042
if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4043
if ( ! aSign ) return a;
4044
goto invalid;
4045
}
4046
if ( aSign ) {
4047
if ( ( aExp | aSig0 ) == 0 ) return a;
4048
invalid:
4049
float_raise( float_flag_invalid );
4050
z.low = floatx80_default_nan_low;
4051
z.high = floatx80_default_nan_high;
4052
return z;
4053
}
4054
if ( aExp == 0 ) {
4055
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4056
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4057
}
4058
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4059
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4060
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4061
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4062
doubleZSig0 = zSig0<<1;
4063
mul64To128( zSig0, zSig0, &term0, &term1 );
4064
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4065
while ( (sbits64) rem0 < 0 ) {
4066
--zSig0;
4067
doubleZSig0 -= 2;
4068
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4069
}
4070
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4071
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4072
if ( zSig1 == 0 ) zSig1 = 1;
4073
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4074
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4075
mul64To128( zSig1, zSig1, &term2, &term3 );
4076
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4077
while ( (sbits64) rem1 < 0 ) {
4078
--zSig1;
4079
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4080
term3 |= 1;
4081
term2 |= doubleZSig0;
4082
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4083
}
4084
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4085
}
4086
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4087
zSig0 |= doubleZSig0;
4088
return
4089
roundAndPackFloatx80(
4090
floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4091
4092
}
4093
4094
/*
4095
-------------------------------------------------------------------------------
4096
Returns 1 if the extended double-precision floating-point value `a' is
4097
equal to the corresponding value `b', and 0 otherwise. The comparison is
4098
performed according to the IEC/IEEE Standard for Binary Floating-Point
4099
Arithmetic.
4100
-------------------------------------------------------------------------------
4101
*/
4102
flag floatx80_eq( floatx80 a, floatx80 b )
4103
{
4104
4105
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4106
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4107
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4108
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4109
) {
4110
if ( floatx80_is_signaling_nan( a )
4111
|| floatx80_is_signaling_nan( b ) ) {
4112
float_raise( float_flag_invalid );
4113
}
4114
return 0;
4115
}
4116
return
4117
( a.low == b.low )
4118
&& ( ( a.high == b.high )
4119
|| ( ( a.low == 0 )
4120
&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4121
);
4122
4123
}
4124
4125
/*
4126
-------------------------------------------------------------------------------
4127
Returns 1 if the extended double-precision floating-point value `a' is
4128
less than or equal to the corresponding value `b', and 0 otherwise. The
4129
comparison is performed according to the IEC/IEEE Standard for Binary
4130
Floating-Point Arithmetic.
4131
-------------------------------------------------------------------------------
4132
*/
4133
flag floatx80_le( floatx80 a, floatx80 b )
4134
{
4135
flag aSign, bSign;
4136
4137
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4138
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4139
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4140
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4141
) {
4142
float_raise( float_flag_invalid );
4143
return 0;
4144
}
4145
aSign = extractFloatx80Sign( a );
4146
bSign = extractFloatx80Sign( b );
4147
if ( aSign != bSign ) {
4148
return
4149
aSign
4150
|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4151
== 0 );
4152
}
4153
return
4154
aSign ? le128( b.high, b.low, a.high, a.low )
4155
: le128( a.high, a.low, b.high, b.low );
4156
4157
}
4158
4159
/*
4160
-------------------------------------------------------------------------------
4161
Returns 1 if the extended double-precision floating-point value `a' is
4162
less than the corresponding value `b', and 0 otherwise. The comparison
4163
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4164
Arithmetic.
4165
-------------------------------------------------------------------------------
4166
*/
4167
flag floatx80_lt( floatx80 a, floatx80 b )
4168
{
4169
flag aSign, bSign;
4170
4171
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4172
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4173
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4174
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4175
) {
4176
float_raise( float_flag_invalid );
4177
return 0;
4178
}
4179
aSign = extractFloatx80Sign( a );
4180
bSign = extractFloatx80Sign( b );
4181
if ( aSign != bSign ) {
4182
return
4183
aSign
4184
&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4185
!= 0 );
4186
}
4187
return
4188
aSign ? lt128( b.high, b.low, a.high, a.low )
4189
: lt128( a.high, a.low, b.high, b.low );
4190
4191
}
4192
4193
/*
4194
-------------------------------------------------------------------------------
4195
Returns 1 if the extended double-precision floating-point value `a' is equal
4196
to the corresponding value `b', and 0 otherwise. The invalid exception is
4197
raised if either operand is a NaN. Otherwise, the comparison is performed
4198
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199
-------------------------------------------------------------------------------
4200
*/
4201
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4202
{
4203
4204
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4205
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4207
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208
) {
4209
float_raise( float_flag_invalid );
4210
return 0;
4211
}
4212
return
4213
( a.low == b.low )
4214
&& ( ( a.high == b.high )
4215
|| ( ( a.low == 0 )
4216
&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217
);
4218
4219
}
4220
4221
/*
4222
-------------------------------------------------------------------------------
4223
Returns 1 if the extended double-precision floating-point value `a' is less
4224
than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4225
do not cause an exception. Otherwise, the comparison is performed according
4226
to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227
-------------------------------------------------------------------------------
4228
*/
4229
flag floatx80_le_quiet( floatx80 a, floatx80 b )
4230
{
4231
flag aSign, bSign;
4232
4233
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4234
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4236
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237
) {
4238
if ( floatx80_is_signaling_nan( a )
4239
|| floatx80_is_signaling_nan( b ) ) {
4240
float_raise( float_flag_invalid );
4241
}
4242
return 0;
4243
}
4244
aSign = extractFloatx80Sign( a );
4245
bSign = extractFloatx80Sign( b );
4246
if ( aSign != bSign ) {
4247
return
4248
aSign
4249
|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4250
== 0 );
4251
}
4252
return
4253
aSign ? le128( b.high, b.low, a.high, a.low )
4254
: le128( a.high, a.low, b.high, b.low );
4255
4256
}
4257
4258
/*
4259
-------------------------------------------------------------------------------
4260
Returns 1 if the extended double-precision floating-point value `a' is less
4261
than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4262
an exception. Otherwise, the comparison is performed according to the
4263
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264
-------------------------------------------------------------------------------
4265
*/
4266
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4267
{
4268
flag aSign, bSign;
4269
4270
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4271
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4272
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4273
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4274
) {
4275
if ( floatx80_is_signaling_nan( a )
4276
|| floatx80_is_signaling_nan( b ) ) {
4277
float_raise( float_flag_invalid );
4278
}
4279
return 0;
4280
}
4281
aSign = extractFloatx80Sign( a );
4282
bSign = extractFloatx80Sign( b );
4283
if ( aSign != bSign ) {
4284
return
4285
aSign
4286
&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4287
!= 0 );
4288
}
4289
return
4290
aSign ? lt128( b.high, b.low, a.high, a.low )
4291
: lt128( a.high, a.low, b.high, b.low );
4292
4293
}
4294
4295
#endif
4296
4297
#ifdef FLOAT128
4298
4299
/*
4300
-------------------------------------------------------------------------------
4301
Returns the result of converting the quadruple-precision floating-point
4302
value `a' to the 32-bit two's complement integer format. The conversion
4303
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304
Arithmetic---which means in particular that the conversion is rounded
4305
according to the current rounding mode. If `a' is a NaN, the largest
4306
positive integer is returned. Otherwise, if the conversion overflows, the
4307
largest integer with the same sign as `a' is returned.
4308
-------------------------------------------------------------------------------
4309
*/
4310
int32 float128_to_int32( float128 a )
4311
{
4312
flag aSign;
4313
int32 aExp, shiftCount;
4314
bits64 aSig0, aSig1;
4315
4316
aSig1 = extractFloat128Frac1( a );
4317
aSig0 = extractFloat128Frac0( a );
4318
aExp = extractFloat128Exp( a );
4319
aSign = extractFloat128Sign( a );
4320
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4321
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4322
aSig0 |= ( aSig1 != 0 );
4323
shiftCount = 0x4028 - aExp;
4324
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4325
return roundAndPackInt32( aSign, aSig0 );
4326
4327
}
4328
4329
/*
4330
-------------------------------------------------------------------------------
4331
Returns the result of converting the quadruple-precision floating-point
4332
value `a' to the 32-bit two's complement integer format. The conversion
4333
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4334
Arithmetic, except that the conversion is always rounded toward zero. If
4335
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
4336
conversion overflows, the largest integer with the same sign as `a' is
4337
returned.
4338
-------------------------------------------------------------------------------
4339
*/
4340
int32 float128_to_int32_round_to_zero( float128 a )
4341
{
4342
flag aSign;
4343
int32 aExp, shiftCount;
4344
bits64 aSig0, aSig1, savedASig;
4345
int32 z;
4346
4347
aSig1 = extractFloat128Frac1( a );
4348
aSig0 = extractFloat128Frac0( a );
4349
aExp = extractFloat128Exp( a );
4350
aSign = extractFloat128Sign( a );
4351
aSig0 |= ( aSig1 != 0 );
4352
if ( 0x401E < aExp ) {
4353
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4354
goto invalid;
4355
}
4356
else if ( aExp < 0x3FFF ) {
4357
if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4358
return 0;
4359
}
4360
aSig0 |= LIT64( 0x0001000000000000 );
4361
shiftCount = 0x402F - aExp;
4362
savedASig = aSig0;
4363
aSig0 >>= shiftCount;
4364
z = aSig0;
4365
if ( aSign ) z = - z;
4366
if ( ( z < 0 ) ^ aSign ) {
4367
invalid:
4368
float_raise( float_flag_invalid );
4369
return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4370
}
4371
if ( ( aSig0<<shiftCount ) != savedASig ) {
4372
float_exception_flags |= float_flag_inexact;
4373
}
4374
return z;
4375
4376
}
4377
4378
/*
4379
-------------------------------------------------------------------------------
4380
Returns the result of converting the quadruple-precision floating-point
4381
value `a' to the 64-bit two's complement integer format. The conversion
4382
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383
Arithmetic---which means in particular that the conversion is rounded
4384
according to the current rounding mode. If `a' is a NaN, the largest
4385
positive integer is returned. Otherwise, if the conversion overflows, the
4386
largest integer with the same sign as `a' is returned.
4387
-------------------------------------------------------------------------------
4388
*/
4389
int64 float128_to_int64( float128 a )
4390
{
4391
flag aSign;
4392
int32 aExp, shiftCount;
4393
bits64 aSig0, aSig1;
4394
4395
aSig1 = extractFloat128Frac1( a );
4396
aSig0 = extractFloat128Frac0( a );
4397
aExp = extractFloat128Exp( a );
4398
aSign = extractFloat128Sign( a );
4399
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4400
shiftCount = 0x402F - aExp;
4401
if ( shiftCount <= 0 ) {
4402
if ( 0x403E < aExp ) {
4403
float_raise( float_flag_invalid );
4404
if ( ! aSign
4405
|| ( ( aExp == 0x7FFF )
4406
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4407
)
4408
) {
4409
return LIT64( 0x7FFFFFFFFFFFFFFF );
4410
}
4411
return (sbits64) LIT64( 0x8000000000000000 );
4412
}
4413
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4414
}
4415
else {
4416
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4417
}
4418
return roundAndPackInt64( aSign, aSig0, aSig1 );
4419
4420
}
4421
4422
/*
4423
-------------------------------------------------------------------------------
4424
Returns the result of converting the quadruple-precision floating-point
4425
value `a' to the 64-bit two's complement integer format. The conversion
4426
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427
Arithmetic, except that the conversion is always rounded toward zero.
4428
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4429
the conversion overflows, the largest integer with the same sign as `a' is
4430
returned.
4431
-------------------------------------------------------------------------------
4432
*/
4433
int64 float128_to_int64_round_to_zero( float128 a )
4434
{
4435
flag aSign;
4436
int32 aExp, shiftCount;
4437
bits64 aSig0, aSig1;
4438
int64 z;
4439
4440
aSig1 = extractFloat128Frac1( a );
4441
aSig0 = extractFloat128Frac0( a );
4442
aExp = extractFloat128Exp( a );
4443
aSign = extractFloat128Sign( a );
4444
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4445
shiftCount = aExp - 0x402F;
4446
if ( 0 < shiftCount ) {
4447
if ( 0x403E <= aExp ) {
4448
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4449
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4450
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4451
if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4452
}
4453
else {
4454
float_raise( float_flag_invalid );
4455
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4456
return LIT64( 0x7FFFFFFFFFFFFFFF );
4457
}
4458
}
4459
return (sbits64) LIT64( 0x8000000000000000 );
4460
}
4461
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4462
if ( (bits64) ( aSig1<<shiftCount ) ) {
4463
float_exception_flags |= float_flag_inexact;
4464
}
4465
}
4466
else {
4467
if ( aExp < 0x3FFF ) {
4468
if ( aExp | aSig0 | aSig1 ) {
4469
float_exception_flags |= float_flag_inexact;
4470
}
4471
return 0;
4472
}
4473
z = aSig0>>( - shiftCount );
4474
if ( aSig1
4475
|| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4476
float_exception_flags |= float_flag_inexact;
4477
}
4478
}
4479
if ( aSign ) z = - z;
4480
return z;
4481
4482
}
4483
4484
#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4485
&& defined(SOFTFLOAT_NEED_FIXUNS)
4486
/*
4487
* just like above - but do not care for overflow of signed results
4488
*/
4489
uint64 float128_to_uint64_round_to_zero( float128 a )
4490
{
4491
flag aSign;
4492
int32 aExp, shiftCount;
4493
bits64 aSig0, aSig1;
4494
uint64 z;
4495
4496
aSig1 = extractFloat128Frac1( a );
4497
aSig0 = extractFloat128Frac0( a );
4498
aExp = extractFloat128Exp( a );
4499
aSign = extractFloat128Sign( a );
4500
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4501
shiftCount = aExp - 0x402F;
4502
if ( 0 < shiftCount ) {
4503
if ( 0x403F <= aExp ) {
4504
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4505
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4506
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4507
if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4508
}
4509
else {
4510
float_raise( float_flag_invalid );
4511
}
4512
return LIT64( 0xFFFFFFFFFFFFFFFF );
4513
}
4514
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4515
if ( (bits64) ( aSig1<<shiftCount ) ) {
4516
float_exception_flags |= float_flag_inexact;
4517
}
4518
}
4519
else {
4520
if ( aExp < 0x3FFF ) {
4521
if ( aExp | aSig0 | aSig1 ) {
4522
float_exception_flags |= float_flag_inexact;
4523
}
4524
return 0;
4525
}
4526
z = aSig0>>( - shiftCount );
4527
if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4528
float_exception_flags |= float_flag_inexact;
4529
}
4530
}
4531
if ( aSign ) z = - z;
4532
return z;
4533
4534
}
4535
#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4536
4537
/*
4538
-------------------------------------------------------------------------------
4539
Returns the result of converting the quadruple-precision floating-point
4540
value `a' to the single-precision floating-point format. The conversion
4541
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4542
Arithmetic.
4543
-------------------------------------------------------------------------------
4544
*/
4545
float32 float128_to_float32( float128 a )
4546
{
4547
flag aSign;
4548
int32 aExp;
4549
bits64 aSig0, aSig1;
4550
bits32 zSig;
4551
4552
aSig1 = extractFloat128Frac1( a );
4553
aSig0 = extractFloat128Frac0( a );
4554
aExp = extractFloat128Exp( a );
4555
aSign = extractFloat128Sign( a );
4556
if ( aExp == 0x7FFF ) {
4557
if ( aSig0 | aSig1 ) {
4558
return commonNaNToFloat32( float128ToCommonNaN( a ) );
4559
}
4560
return packFloat32( aSign, 0xFF, 0 );
4561
}
4562
aSig0 |= ( aSig1 != 0 );
4563
shift64RightJamming( aSig0, 18, &aSig0 );
4564
zSig = aSig0;
4565
if ( aExp || zSig ) {
4566
zSig |= 0x40000000;
4567
aExp -= 0x3F81;
4568
}
4569
return roundAndPackFloat32( aSign, aExp, zSig );
4570
4571
}
4572
4573
/*
4574
-------------------------------------------------------------------------------
4575
Returns the result of converting the quadruple-precision floating-point
4576
value `a' to the double-precision floating-point format. The conversion
4577
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578
Arithmetic.
4579
-------------------------------------------------------------------------------
4580
*/
4581
float64 float128_to_float64( float128 a )
4582
{
4583
flag aSign;
4584
int32 aExp;
4585
bits64 aSig0, aSig1;
4586
4587
aSig1 = extractFloat128Frac1( a );
4588
aSig0 = extractFloat128Frac0( a );
4589
aExp = extractFloat128Exp( a );
4590
aSign = extractFloat128Sign( a );
4591
if ( aExp == 0x7FFF ) {
4592
if ( aSig0 | aSig1 ) {
4593
return commonNaNToFloat64( float128ToCommonNaN( a ) );
4594
}
4595
return packFloat64( aSign, 0x7FF, 0 );
4596
}
4597
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4598
aSig0 |= ( aSig1 != 0 );
4599
if ( aExp || aSig0 ) {
4600
aSig0 |= LIT64( 0x4000000000000000 );
4601
aExp -= 0x3C01;
4602
}
4603
return roundAndPackFloat64( aSign, aExp, aSig0 );
4604
4605
}
4606
4607
#ifdef FLOATX80
4608
4609
/*
4610
-------------------------------------------------------------------------------
4611
Returns the result of converting the quadruple-precision floating-point
4612
value `a' to the extended double-precision floating-point format. The
4613
conversion is performed according to the IEC/IEEE Standard for Binary
4614
Floating-Point Arithmetic.
4615
-------------------------------------------------------------------------------
4616
*/
4617
floatx80 float128_to_floatx80( float128 a )
4618
{
4619
flag aSign;
4620
int32 aExp;
4621
bits64 aSig0, aSig1;
4622
4623
aSig1 = extractFloat128Frac1( a );
4624
aSig0 = extractFloat128Frac0( a );
4625
aExp = extractFloat128Exp( a );
4626
aSign = extractFloat128Sign( a );
4627
if ( aExp == 0x7FFF ) {
4628
if ( aSig0 | aSig1 ) {
4629
return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4630
}
4631
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4632
}
4633
if ( aExp == 0 ) {
4634
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4635
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636
}
4637
else {
4638
aSig0 |= LIT64( 0x0001000000000000 );
4639
}
4640
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4641
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4642
4643
}
4644
4645
#endif
4646
4647
/*
4648
-------------------------------------------------------------------------------
4649
Rounds the quadruple-precision floating-point value `a' to an integer, and
4650
returns the result as a quadruple-precision floating-point value. The
4651
operation is performed according to the IEC/IEEE Standard for Binary
4652
Floating-Point Arithmetic.
4653
-------------------------------------------------------------------------------
4654
*/
4655
float128 float128_round_to_int( float128 a )
4656
{
4657
flag aSign;
4658
int32 aExp;
4659
bits64 lastBitMask, roundBitsMask;
4660
int8 roundingMode;
4661
float128 z;
4662
4663
aExp = extractFloat128Exp( a );
4664
if ( 0x402F <= aExp ) {
4665
if ( 0x406F <= aExp ) {
4666
if ( ( aExp == 0x7FFF )
4667
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4668
) {
4669
return propagateFloat128NaN( a, a );
4670
}
4671
return a;
4672
}
4673
lastBitMask = 1;
4674
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4675
roundBitsMask = lastBitMask - 1;
4676
z = a;
4677
roundingMode = float_rounding_mode;
4678
if ( roundingMode == float_round_nearest_even ) {
4679
if ( lastBitMask ) {
4680
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4681
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4682
}
4683
else {
4684
if ( (sbits64) z.low < 0 ) {
4685
++z.high;
4686
if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4687
}
4688
}
4689
}
4690
else if ( roundingMode != float_round_to_zero ) {
4691
if ( extractFloat128Sign( z )
4692
^ ( roundingMode == float_round_up ) ) {
4693
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4694
}
4695
}
4696
z.low &= ~ roundBitsMask;
4697
}
4698
else {
4699
if ( aExp < 0x3FFF ) {
4700
if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4701
float_exception_flags |= float_flag_inexact;
4702
aSign = extractFloat128Sign( a );
4703
switch ( float_rounding_mode ) {
4704
case float_round_nearest_even:
4705
if ( ( aExp == 0x3FFE )
4706
&& ( extractFloat128Frac0( a )
4707
| extractFloat128Frac1( a ) )
4708
) {
4709
return packFloat128( aSign, 0x3FFF, 0, 0 );
4710
}
4711
break;
4712
case float_round_to_zero:
4713
break;
4714
case float_round_down:
4715
return
4716
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4717
: packFloat128( 0, 0, 0, 0 );
4718
case float_round_up:
4719
return
4720
aSign ? packFloat128( 1, 0, 0, 0 )
4721
: packFloat128( 0, 0x3FFF, 0, 0 );
4722
}
4723
return packFloat128( aSign, 0, 0, 0 );
4724
}
4725
lastBitMask = 1;
4726
lastBitMask <<= 0x402F - aExp;
4727
roundBitsMask = lastBitMask - 1;
4728
z.low = 0;
4729
z.high = a.high;
4730
roundingMode = float_rounding_mode;
4731
if ( roundingMode == float_round_nearest_even ) {
4732
z.high += lastBitMask>>1;
4733
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4734
z.high &= ~ lastBitMask;
4735
}
4736
}
4737
else if ( roundingMode != float_round_to_zero ) {
4738
if ( extractFloat128Sign( z )
4739
^ ( roundingMode == float_round_up ) ) {
4740
z.high |= ( a.low != 0 );
4741
z.high += roundBitsMask;
4742
}
4743
}
4744
z.high &= ~ roundBitsMask;
4745
}
4746
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4747
float_exception_flags |= float_flag_inexact;
4748
}
4749
return z;
4750
4751
}
4752
4753
/*
4754
-------------------------------------------------------------------------------
4755
Returns the result of adding the absolute values of the quadruple-precision
4756
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4757
before being returned. `zSign' is ignored if the result is a NaN.
4758
The addition is performed according to the IEC/IEEE Standard for Binary
4759
Floating-Point Arithmetic.
4760
-------------------------------------------------------------------------------
4761
*/
4762
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4763
{
4764
int32 aExp, bExp, zExp;
4765
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4766
int32 expDiff;
4767
4768
aSig1 = extractFloat128Frac1( a );
4769
aSig0 = extractFloat128Frac0( a );
4770
aExp = extractFloat128Exp( a );
4771
bSig1 = extractFloat128Frac1( b );
4772
bSig0 = extractFloat128Frac0( b );
4773
bExp = extractFloat128Exp( b );
4774
expDiff = aExp - bExp;
4775
if ( 0 < expDiff ) {
4776
if ( aExp == 0x7FFF ) {
4777
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4778
return a;
4779
}
4780
if ( bExp == 0 ) {
4781
--expDiff;
4782
}
4783
else {
4784
bSig0 |= LIT64( 0x0001000000000000 );
4785
}
4786
shift128ExtraRightJamming(
4787
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4788
zExp = aExp;
4789
}
4790
else if ( expDiff < 0 ) {
4791
if ( bExp == 0x7FFF ) {
4792
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4793
return packFloat128( zSign, 0x7FFF, 0, 0 );
4794
}
4795
if ( aExp == 0 ) {
4796
++expDiff;
4797
}
4798
else {
4799
aSig0 |= LIT64( 0x0001000000000000 );
4800
}
4801
shift128ExtraRightJamming(
4802
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4803
zExp = bExp;
4804
}
4805
else {
4806
if ( aExp == 0x7FFF ) {
4807
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4808
return propagateFloat128NaN( a, b );
4809
}
4810
return a;
4811
}
4812
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4813
if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4814
zSig2 = 0;
4815
zSig0 |= LIT64( 0x0002000000000000 );
4816
zExp = aExp;
4817
goto shiftRight1;
4818
}
4819
aSig0 |= LIT64( 0x0001000000000000 );
4820
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4821
--zExp;
4822
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4823
++zExp;
4824
shiftRight1:
4825
shift128ExtraRightJamming(
4826
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4827
roundAndPack:
4828
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4829
4830
}
4831
4832
/*
4833
-------------------------------------------------------------------------------
4834
Returns the result of subtracting the absolute values of the quadruple-
4835
precision floating-point values `a' and `b'. If `zSign' is 1, the
4836
difference is negated before being returned. `zSign' is ignored if the
4837
result is a NaN. The subtraction is performed according to the IEC/IEEE
4838
Standard for Binary Floating-Point Arithmetic.
4839
-------------------------------------------------------------------------------
4840
*/
4841
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4842
{
4843
int32 aExp, bExp, zExp;
4844
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4845
int32 expDiff;
4846
float128 z;
4847
4848
aSig1 = extractFloat128Frac1( a );
4849
aSig0 = extractFloat128Frac0( a );
4850
aExp = extractFloat128Exp( a );
4851
bSig1 = extractFloat128Frac1( b );
4852
bSig0 = extractFloat128Frac0( b );
4853
bExp = extractFloat128Exp( b );
4854
expDiff = aExp - bExp;
4855
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4856
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4857
if ( 0 < expDiff ) goto aExpBigger;
4858
if ( expDiff < 0 ) goto bExpBigger;
4859
if ( aExp == 0x7FFF ) {
4860
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4861
return propagateFloat128NaN( a, b );
4862
}
4863
float_raise( float_flag_invalid );
4864
z.low = float128_default_nan_low;
4865
z.high = float128_default_nan_high;
4866
return z;
4867
}
4868
if ( aExp == 0 ) {
4869
aExp = 1;
4870
bExp = 1;
4871
}
4872
if ( bSig0 < aSig0 ) goto aBigger;
4873
if ( aSig0 < bSig0 ) goto bBigger;
4874
if ( bSig1 < aSig1 ) goto aBigger;
4875
if ( aSig1 < bSig1 ) goto bBigger;
4876
return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4877
bExpBigger:
4878
if ( bExp == 0x7FFF ) {
4879
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4881
}
4882
if ( aExp == 0 ) {
4883
++expDiff;
4884
}
4885
else {
4886
aSig0 |= LIT64( 0x4000000000000000 );
4887
}
4888
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4889
bSig0 |= LIT64( 0x4000000000000000 );
4890
bBigger:
4891
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4892
zExp = bExp;
4893
zSign ^= 1;
4894
goto normalizeRoundAndPack;
4895
aExpBigger:
4896
if ( aExp == 0x7FFF ) {
4897
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4898
return a;
4899
}
4900
if ( bExp == 0 ) {
4901
--expDiff;
4902
}
4903
else {
4904
bSig0 |= LIT64( 0x4000000000000000 );
4905
}
4906
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4907
aSig0 |= LIT64( 0x4000000000000000 );
4908
aBigger:
4909
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4910
zExp = aExp;
4911
normalizeRoundAndPack:
4912
--zExp;
4913
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4914
4915
}
4916
4917
/*
4918
-------------------------------------------------------------------------------
4919
Returns the result of adding the quadruple-precision floating-point values
4920
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
4921
for Binary Floating-Point Arithmetic.
4922
-------------------------------------------------------------------------------
4923
*/
4924
float128 float128_add( float128 a, float128 b )
4925
{
4926
flag aSign, bSign;
4927
4928
aSign = extractFloat128Sign( a );
4929
bSign = extractFloat128Sign( b );
4930
if ( aSign == bSign ) {
4931
return addFloat128Sigs( a, b, aSign );
4932
}
4933
else {
4934
return subFloat128Sigs( a, b, aSign );
4935
}
4936
4937
}
4938
4939
/*
4940
-------------------------------------------------------------------------------
4941
Returns the result of subtracting the quadruple-precision floating-point
4942
values `a' and `b'. The operation is performed according to the IEC/IEEE
4943
Standard for Binary Floating-Point Arithmetic.
4944
-------------------------------------------------------------------------------
4945
*/
4946
float128 float128_sub( float128 a, float128 b )
4947
{
4948
flag aSign, bSign;
4949
4950
aSign = extractFloat128Sign( a );
4951
bSign = extractFloat128Sign( b );
4952
if ( aSign == bSign ) {
4953
return subFloat128Sigs( a, b, aSign );
4954
}
4955
else {
4956
return addFloat128Sigs( a, b, aSign );
4957
}
4958
4959
}
4960
4961
/*
4962
-------------------------------------------------------------------------------
4963
Returns the result of multiplying the quadruple-precision floating-point
4964
values `a' and `b'. The operation is performed according to the IEC/IEEE
4965
Standard for Binary Floating-Point Arithmetic.
4966
-------------------------------------------------------------------------------
4967
*/
4968
float128 float128_mul( float128 a, float128 b )
4969
{
4970
flag aSign, bSign, zSign;
4971
int32 aExp, bExp, zExp;
4972
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4973
float128 z;
4974
4975
aSig1 = extractFloat128Frac1( a );
4976
aSig0 = extractFloat128Frac0( a );
4977
aExp = extractFloat128Exp( a );
4978
aSign = extractFloat128Sign( a );
4979
bSig1 = extractFloat128Frac1( b );
4980
bSig0 = extractFloat128Frac0( b );
4981
bExp = extractFloat128Exp( b );
4982
bSign = extractFloat128Sign( b );
4983
zSign = aSign ^ bSign;
4984
if ( aExp == 0x7FFF ) {
4985
if ( ( aSig0 | aSig1 )
4986
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987
return propagateFloat128NaN( a, b );
4988
}
4989
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4990
return packFloat128( zSign, 0x7FFF, 0, 0 );
4991
}
4992
if ( bExp == 0x7FFF ) {
4993
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4994
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4995
invalid:
4996
float_raise( float_flag_invalid );
4997
z.low = float128_default_nan_low;
4998
z.high = float128_default_nan_high;
4999
return z;
5000
}
5001
return packFloat128( zSign, 0x7FFF, 0, 0 );
5002
}
5003
if ( aExp == 0 ) {
5004
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5005
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5006
}
5007
if ( bExp == 0 ) {
5008
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5009
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5010
}
5011
zExp = aExp + bExp - 0x4000;
5012
aSig0 |= LIT64( 0x0001000000000000 );
5013
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5014
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5015
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5016
zSig2 |= ( zSig3 != 0 );
5017
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5018
shift128ExtraRightJamming(
5019
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5020
++zExp;
5021
}
5022
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5023
5024
}
5025
5026
/*
5027
-------------------------------------------------------------------------------
5028
Returns the result of dividing the quadruple-precision floating-point value
5029
`a' by the corresponding value `b'. The operation is performed according to
5030
the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031
-------------------------------------------------------------------------------
5032
*/
5033
float128 float128_div( float128 a, float128 b )
5034
{
5035
flag aSign, bSign, zSign;
5036
int32 aExp, bExp, zExp;
5037
bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5038
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5039
float128 z;
5040
5041
aSig1 = extractFloat128Frac1( a );
5042
aSig0 = extractFloat128Frac0( a );
5043
aExp = extractFloat128Exp( a );
5044
aSign = extractFloat128Sign( a );
5045
bSig1 = extractFloat128Frac1( b );
5046
bSig0 = extractFloat128Frac0( b );
5047
bExp = extractFloat128Exp( b );
5048
bSign = extractFloat128Sign( b );
5049
zSign = aSign ^ bSign;
5050
if ( aExp == 0x7FFF ) {
5051
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5052
if ( bExp == 0x7FFF ) {
5053
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5054
goto invalid;
5055
}
5056
return packFloat128( zSign, 0x7FFF, 0, 0 );
5057
}
5058
if ( bExp == 0x7FFF ) {
5059
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5060
return packFloat128( zSign, 0, 0, 0 );
5061
}
5062
if ( bExp == 0 ) {
5063
if ( ( bSig0 | bSig1 ) == 0 ) {
5064
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5065
invalid:
5066
float_raise( float_flag_invalid );
5067
z.low = float128_default_nan_low;
5068
z.high = float128_default_nan_high;
5069
return z;
5070
}
5071
float_raise( float_flag_divbyzero );
5072
return packFloat128( zSign, 0x7FFF, 0, 0 );
5073
}
5074
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5075
}
5076
if ( aExp == 0 ) {
5077
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5078
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079
}
5080
zExp = aExp - bExp + 0x3FFD;
5081
shortShift128Left(
5082
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5083
shortShift128Left(
5084
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5085
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5086
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5087
++zExp;
5088
}
5089
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5090
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5091
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5092
while ( (sbits64) rem0 < 0 ) {
5093
--zSig0;
5094
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5095
}
5096
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5097
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5098
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5099
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5100
while ( (sbits64) rem1 < 0 ) {
5101
--zSig1;
5102
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5103
}
5104
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5105
}
5106
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5107
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5108
5109
}
5110
5111
/*
5112
-------------------------------------------------------------------------------
5113
Returns the remainder of the quadruple-precision floating-point value `a'
5114
with respect to the corresponding value `b'. The operation is performed
5115
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116
-------------------------------------------------------------------------------
5117
*/
5118
float128 float128_rem( float128 a, float128 b )
5119
{
5120
flag aSign, bSign, zSign;
5121
int32 aExp, bExp, expDiff;
5122
bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5123
bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5124
sbits64 sigMean0;
5125
float128 z;
5126
5127
aSig1 = extractFloat128Frac1( a );
5128
aSig0 = extractFloat128Frac0( a );
5129
aExp = extractFloat128Exp( a );
5130
aSign = extractFloat128Sign( a );
5131
bSig1 = extractFloat128Frac1( b );
5132
bSig0 = extractFloat128Frac0( b );
5133
bExp = extractFloat128Exp( b );
5134
bSign = extractFloat128Sign( b );
5135
if ( aExp == 0x7FFF ) {
5136
if ( ( aSig0 | aSig1 )
5137
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138
return propagateFloat128NaN( a, b );
5139
}
5140
goto invalid;
5141
}
5142
if ( bExp == 0x7FFF ) {
5143
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144
return a;
5145
}
5146
if ( bExp == 0 ) {
5147
if ( ( bSig0 | bSig1 ) == 0 ) {
5148
invalid:
5149
float_raise( float_flag_invalid );
5150
z.low = float128_default_nan_low;
5151
z.high = float128_default_nan_high;
5152
return z;
5153
}
5154
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155
}
5156
if ( aExp == 0 ) {
5157
if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159
}
5160
expDiff = aExp - bExp;
5161
if ( expDiff < -1 ) return a;
5162
shortShift128Left(
5163
aSig0 | LIT64( 0x0001000000000000 ),
5164
aSig1,
5165
15 - ( expDiff < 0 ),
5166
&aSig0,
5167
&aSig1
5168
);
5169
shortShift128Left(
5170
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171
q = le128( bSig0, bSig1, aSig0, aSig1 );
5172
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173
expDiff -= 64;
5174
while ( 0 < expDiff ) {
5175
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176
q = ( 4 < q ) ? q - 4 : 0;
5177
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181
expDiff -= 61;
5182
}
5183
if ( -64 < expDiff ) {
5184
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185
q = ( 4 < q ) ? q - 4 : 0;
5186
q >>= - expDiff;
5187
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188
expDiff += 52;
5189
if ( expDiff < 0 ) {
5190
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191
}
5192
else {
5193
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194
}
5195
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197
}
5198
else {
5199
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201
}
5202
do {
5203
alternateASig0 = aSig0;
5204
alternateASig1 = aSig1;
5205
++q;
5206
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207
} while ( 0 <= (sbits64) aSig0 );
5208
add128(
5209
aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210
if ( ( sigMean0 < 0 )
5211
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212
aSig0 = alternateASig0;
5213
aSig1 = alternateASig1;
5214
}
5215
zSign = ( (sbits64) aSig0 < 0 );
5216
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217
return
5218
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219
5220
}
5221
5222
/*
5223
-------------------------------------------------------------------------------
5224
Returns the square root of the quadruple-precision floating-point value `a'.
5225
The operation is performed according to the IEC/IEEE Standard for Binary
5226
Floating-Point Arithmetic.
5227
-------------------------------------------------------------------------------
5228
*/
5229
float128 float128_sqrt( float128 a )
5230
{
5231
flag aSign;
5232
int32 aExp, zExp;
5233
bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235
float128 z;
5236
5237
aSig1 = extractFloat128Frac1( a );
5238
aSig0 = extractFloat128Frac0( a );
5239
aExp = extractFloat128Exp( a );
5240
aSign = extractFloat128Sign( a );
5241
if ( aExp == 0x7FFF ) {
5242
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243
if ( ! aSign ) return a;
5244
goto invalid;
5245
}
5246
if ( aSign ) {
5247
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248
invalid:
5249
float_raise( float_flag_invalid );
5250
z.low = float128_default_nan_low;
5251
z.high = float128_default_nan_high;
5252
return z;
5253
}
5254
if ( aExp == 0 ) {
5255
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257
}
5258
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5259
aSig0 |= LIT64( 0x0001000000000000 );
5260
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5261
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263
doubleZSig0 = zSig0<<1;
5264
mul64To128( zSig0, zSig0, &term0, &term1 );
5265
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266
while ( (sbits64) rem0 < 0 ) {
5267
--zSig0;
5268
doubleZSig0 -= 2;
5269
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270
}
5271
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273
if ( zSig1 == 0 ) zSig1 = 1;
5274
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276
mul64To128( zSig1, zSig1, &term2, &term3 );
5277
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278
while ( (sbits64) rem1 < 0 ) {
5279
--zSig1;
5280
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281
term3 |= 1;
5282
term2 |= doubleZSig0;
5283
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284
}
5285
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286
}
5287
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289
5290
}
5291
5292
/*
5293
-------------------------------------------------------------------------------
5294
Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295
the corresponding value `b', and 0 otherwise. The comparison is performed
5296
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297
-------------------------------------------------------------------------------
5298
*/
5299
flag float128_eq( float128 a, float128 b )
5300
{
5301
5302
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5303
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5305
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306
) {
5307
if ( float128_is_signaling_nan( a )
5308
|| float128_is_signaling_nan( b ) ) {
5309
float_raise( float_flag_invalid );
5310
}
5311
return 0;
5312
}
5313
return
5314
( a.low == b.low )
5315
&& ( ( a.high == b.high )
5316
|| ( ( a.low == 0 )
5317
&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318
);
5319
5320
}
5321
5322
/*
5323
-------------------------------------------------------------------------------
5324
Returns 1 if the quadruple-precision floating-point value `a' is less than
5325
or equal to the corresponding value `b', and 0 otherwise. The comparison
5326
is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327
Arithmetic.
5328
-------------------------------------------------------------------------------
5329
*/
5330
flag float128_le( float128 a, float128 b )
5331
{
5332
flag aSign, bSign;
5333
5334
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5335
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5337
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338
) {
5339
float_raise( float_flag_invalid );
5340
return 0;
5341
}
5342
aSign = extractFloat128Sign( a );
5343
bSign = extractFloat128Sign( b );
5344
if ( aSign != bSign ) {
5345
return
5346
aSign
5347
|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348
== 0 );
5349
}
5350
return
5351
aSign ? le128( b.high, b.low, a.high, a.low )
5352
: le128( a.high, a.low, b.high, b.low );
5353
5354
}
5355
5356
/*
5357
-------------------------------------------------------------------------------
5358
Returns 1 if the quadruple-precision floating-point value `a' is less than
5359
the corresponding value `b', and 0 otherwise. The comparison is performed
5360
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361
-------------------------------------------------------------------------------
5362
*/
5363
flag float128_lt( float128 a, float128 b )
5364
{
5365
flag aSign, bSign;
5366
5367
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5368
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5370
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371
) {
5372
float_raise( float_flag_invalid );
5373
return 0;
5374
}
5375
aSign = extractFloat128Sign( a );
5376
bSign = extractFloat128Sign( b );
5377
if ( aSign != bSign ) {
5378
return
5379
aSign
5380
&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381
!= 0 );
5382
}
5383
return
5384
aSign ? lt128( b.high, b.low, a.high, a.low )
5385
: lt128( a.high, a.low, b.high, b.low );
5386
5387
}
5388
5389
/*
5390
-------------------------------------------------------------------------------
5391
Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392
the corresponding value `b', and 0 otherwise. The invalid exception is
5393
raised if either operand is a NaN. Otherwise, the comparison is performed
5394
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395
-------------------------------------------------------------------------------
5396
*/
5397
flag float128_eq_signaling( float128 a, float128 b )
5398
{
5399
5400
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5401
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5403
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404
) {
5405
float_raise( float_flag_invalid );
5406
return 0;
5407
}
5408
return
5409
( a.low == b.low )
5410
&& ( ( a.high == b.high )
5411
|| ( ( a.low == 0 )
5412
&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413
);
5414
5415
}
5416
5417
/*
5418
-------------------------------------------------------------------------------
5419
Returns 1 if the quadruple-precision floating-point value `a' is less than
5420
or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5421
cause an exception. Otherwise, the comparison is performed according to the
5422
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423
-------------------------------------------------------------------------------
5424
*/
5425
flag float128_le_quiet( float128 a, float128 b )
5426
{
5427
flag aSign, bSign;
5428
5429
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5430
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5432
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433
) {
5434
if ( float128_is_signaling_nan( a )
5435
|| float128_is_signaling_nan( b ) ) {
5436
float_raise( float_flag_invalid );
5437
}
5438
return 0;
5439
}
5440
aSign = extractFloat128Sign( a );
5441
bSign = extractFloat128Sign( b );
5442
if ( aSign != bSign ) {
5443
return
5444
aSign
5445
|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446
== 0 );
5447
}
5448
return
5449
aSign ? le128( b.high, b.low, a.high, a.low )
5450
: le128( a.high, a.low, b.high, b.low );
5451
5452
}
5453
5454
/*
5455
-------------------------------------------------------------------------------
5456
Returns 1 if the quadruple-precision floating-point value `a' is less than
5457
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5458
exception. Otherwise, the comparison is performed according to the IEC/IEEE
5459
Standard for Binary Floating-Point Arithmetic.
5460
-------------------------------------------------------------------------------
5461
*/
5462
flag float128_lt_quiet( float128 a, float128 b )
5463
{
5464
flag aSign, bSign;
5465
5466
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5467
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
5469
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470
) {
5471
if ( float128_is_signaling_nan( a )
5472
|| float128_is_signaling_nan( b ) ) {
5473
float_raise( float_flag_invalid );
5474
}
5475
return 0;
5476
}
5477
aSign = extractFloat128Sign( a );
5478
bSign = extractFloat128Sign( b );
5479
if ( aSign != bSign ) {
5480
return
5481
aSign
5482
&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483
!= 0 );
5484
}
5485
return
5486
aSign ? lt128( b.high, b.low, a.high, a.low )
5487
: lt128( a.high, a.low, b.high, b.low );
5488
5489
}
5490
5491
#endif
5492
5493
5494
#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495
5496
/*
5497
* These two routines are not part of the original softfloat distribution.
5498
*
5499
* They are based on the corresponding conversions to integer but return
5500
* unsigned numbers instead since these functions are required by GCC.
5501
*
5502
* Added by Mark Brinicombe <[email protected]> 27/09/97
5503
*
5504
* float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505
*/
5506
5507
/*
5508
-------------------------------------------------------------------------------
5509
Returns the result of converting the double-precision floating-point value
5510
`a' to the 32-bit unsigned integer format. The conversion is
5511
performed according to the IEC/IEEE Standard for Binary Floating-point
5512
Arithmetic, except that the conversion is always rounded toward zero. If
5513
`a' is a NaN, the largest positive integer is returned. If the conversion
5514
overflows, the largest integer positive is returned.
5515
-------------------------------------------------------------------------------
5516
*/
5517
uint32 float64_to_uint32_round_to_zero( float64 a )
5518
{
5519
flag aSign;
5520
int16 aExp, shiftCount;
5521
bits64 aSig, savedASig;
5522
uint32 z;
5523
5524
aSig = extractFloat64Frac( a );
5525
aExp = extractFloat64Exp( a );
5526
aSign = extractFloat64Sign( a );
5527
5528
if (aSign) {
5529
float_raise( float_flag_invalid );
5530
return(0);
5531
}
5532
5533
if ( 0x41E < aExp ) {
5534
float_raise( float_flag_invalid );
5535
return 0xffffffff;
5536
}
5537
else if ( aExp < 0x3FF ) {
5538
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539
return 0;
5540
}
5541
aSig |= LIT64( 0x0010000000000000 );
5542
shiftCount = 0x433 - aExp;
5543
savedASig = aSig;
5544
aSig >>= shiftCount;
5545
z = aSig;
5546
if ( ( aSig<<shiftCount ) != savedASig ) {
5547
float_exception_flags |= float_flag_inexact;
5548
}
5549
return z;
5550
5551
}
5552
5553
/*
5554
-------------------------------------------------------------------------------
5555
Returns the result of converting the single-precision floating-point value
5556
`a' to the 32-bit unsigned integer format. The conversion is
5557
performed according to the IEC/IEEE Standard for Binary Floating-point
5558
Arithmetic, except that the conversion is always rounded toward zero. If
5559
`a' is a NaN, the largest positive integer is returned. If the conversion
5560
overflows, the largest positive integer is returned.
5561
-------------------------------------------------------------------------------
5562
*/
5563
uint32 float32_to_uint32_round_to_zero( float32 a )
5564
{
5565
flag aSign;
5566
int16 aExp, shiftCount;
5567
bits32 aSig;
5568
uint32 z;
5569
5570
aSig = extractFloat32Frac( a );
5571
aExp = extractFloat32Exp( a );
5572
aSign = extractFloat32Sign( a );
5573
shiftCount = aExp - 0x9E;
5574
5575
if (aSign) {
5576
float_raise( float_flag_invalid );
5577
return(0);
5578
}
5579
if ( 0 < shiftCount ) {
5580
float_raise( float_flag_invalid );
5581
return 0xFFFFFFFF;
5582
}
5583
else if ( aExp <= 0x7E ) {
5584
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585
return 0;
5586
}
5587
aSig = ( aSig | 0x800000 )<<8;
5588
z = aSig>>( - shiftCount );
5589
if ( aSig<<( shiftCount & 31 ) ) {
5590
float_exception_flags |= float_flag_inexact;
5591
}
5592
return z;
5593
5594
}
5595
5596
#endif
5597
5598