CoCalc -- softfloat.c

GitHub Repository: freebsd/freebsd-src
Path: blob/main/lib/libc/softfloat/bits64/softfloat.c
³⁹⁵³⁰ views
1
/* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2

3
/*
4
 * This version hacked for use with gcc -msoft-float by bjh21.
5
 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6
 *  itself).
7
 */
8

9
/*
10
 * Things you may want to define:
11
 *
12
 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13
 *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14
 *   properly renamed.
15
 */
16

17
/*
18
===============================================================================
19

20
This C source file is part of the SoftFloat IEC/IEEE Floating-point
21
Arithmetic Package, Release 2a.
22

23
Written by John R. Hauser.  This work was made possible in part by the
24
International Computer Science Institute, located at Suite 600, 1947 Center
25
Street, Berkeley, California 94704.  Funding was partially provided by the
26
National Science Foundation under grant MIP-9311980.  The original version
27
of this code was written as part of a project to build a fixed-point vector
28
processor in collaboration with the University of California at Berkeley,
29
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31
arithmetic/SoftFloat.html'.
32

33
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38

39
Derivative works are acceptable, even for commercial purposes, so long as
40
(1) they include prominent notice that the work is derivative, and (2) they
41
include prominent notice akin to these four paragraphs for those parts of
42
this code that are retained.
43

44
===============================================================================
45
*/
46

47
#ifdef SOFTFLOAT_FOR_GCC
48
#include "softfloat-for-gcc.h"
49
#endif
50

51
#include "milieu.h"
52
#include "softfloat.h"
53

54
/*
55
 * Conversions between floats as stored in memory and floats as
56
 * SoftFloat uses them
57
 */
58
#ifndef FLOAT64_DEMANGLE
59
#define FLOAT64_DEMANGLE(a)	(a)
60
#endif
61
#ifndef FLOAT64_MANGLE
62
#define FLOAT64_MANGLE(a)	(a)
63
#endif
64

65
/*
66
-------------------------------------------------------------------------------
67
Floating-point rounding mode, extended double-precision rounding precision,
68
and exception flags.
69
-------------------------------------------------------------------------------
70
*/
71
int float_rounding_mode = float_round_nearest_even;
72
int float_exception_flags = 0;
73
#ifdef FLOATX80
74
int8 floatx80_rounding_precision = 80;
75
#endif
76

77
/*
78
-------------------------------------------------------------------------------
79
Primitive arithmetic functions, including multi-word arithmetic, and
80
division and square root approximations.  (Can be specialized to target if
81
desired.)
82
-------------------------------------------------------------------------------
83
*/
84
#include "softfloat-macros"
85

86
/*
87
-------------------------------------------------------------------------------
88
Functions and definitions to determine:  (1) whether tininess for underflow
89
is detected before or after rounding by default, (2) what (if anything)
90
happens when exceptions are raised, (3) how signaling NaNs are distinguished
91
from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92
are propagated from function inputs to output.  These details are target-
93
specific.
94
-------------------------------------------------------------------------------
95
*/
96
#include "softfloat-specialize"
97

98
#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
99
/*
100
-------------------------------------------------------------------------------
101
Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102
and 7, and returns the properly rounded 32-bit integer corresponding to the
103
input.  If `zSign' is 1, the input is negated before being converted to an
104
integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
105
is simply rounded to an integer, with the inexact exception raised if the
106
input cannot be represented exactly as an integer.  However, if the fixed-
107
point input is too large, the invalid exception is raised and the largest
108
positive or negative integer is returned.
109
-------------------------------------------------------------------------------
110
*/
111
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
112
{
113
    int8 roundingMode;
114
    flag roundNearestEven;
115
    int8 roundIncrement, roundBits;
116
    int32 z;
117

118
    roundingMode = float_rounding_mode;
119
    roundNearestEven = ( roundingMode == float_round_nearest_even );
120
    roundIncrement = 0x40;
121
    if ( ! roundNearestEven ) {
122
        if ( roundingMode == float_round_to_zero ) {
123
            roundIncrement = 0;
124
        }
125
        else {
126
            roundIncrement = 0x7F;
127
            if ( zSign ) {
128
                if ( roundingMode == float_round_up ) roundIncrement = 0;
129
            }
130
            else {
131
                if ( roundingMode == float_round_down ) roundIncrement = 0;
132
            }
133
        }
134
    }
135
    roundBits = absZ & 0x7F;
136
    absZ = ( absZ + roundIncrement )>>7;
137
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
138
    z = absZ;
139
    if ( zSign ) z = - z;
140
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
141
        float_raise( float_flag_invalid );
142
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
143
    }
144
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
145
    return z;
146

147
}
148

149
/*
150
-------------------------------------------------------------------------------
151
Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152
`absZ1', with binary point between bits 63 and 64 (between the input words),
153
and returns the properly rounded 64-bit integer corresponding to the input.
154
If `zSign' is 1, the input is negated before being converted to an integer.
155
Ordinarily, the fixed-point input is simply rounded to an integer, with
156
the inexact exception raised if the input cannot be represented exactly as
157
an integer.  However, if the fixed-point input is too large, the invalid
158
exception is raised and the largest positive or negative integer is
159
returned.
160
-------------------------------------------------------------------------------
161
*/
162
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
163
{
164
    int8 roundingMode;
165
    flag roundNearestEven, increment;
166
    int64 z;
167

168
    roundingMode = float_rounding_mode;
169
    roundNearestEven = ( roundingMode == float_round_nearest_even );
170
    increment = ( (sbits64) absZ1 < 0 );
171
    if ( ! roundNearestEven ) {
172
        if ( roundingMode == float_round_to_zero ) {
173
            increment = 0;
174
        }
175
        else {
176
            if ( zSign ) {
177
                increment = ( roundingMode == float_round_down ) && absZ1;
178
            }
179
            else {
180
                increment = ( roundingMode == float_round_up ) && absZ1;
181
            }
182
        }
183
    }
184
    if ( increment ) {
185
        ++absZ0;
186
        if ( absZ0 == 0 ) goto overflow;
187
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188
    }
189
    z = absZ0;
190
    if ( zSign ) z = - z;
191
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
192
 overflow:
193
        float_raise( float_flag_invalid );
194
        return
195
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
196
            : LIT64( 0x7FFFFFFFFFFFFFFF );
197
    }
198
    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
199
    return z;
200

201
}
202
#endif
203

204
/*
205
-------------------------------------------------------------------------------
206
Returns the fraction bits of the single-precision floating-point value `a'.
207
-------------------------------------------------------------------------------
208
*/
209
INLINE bits32 extractFloat32Frac( float32 a )
210
{
211

212
    return a & 0x007FFFFF;
213

214
}
215

216
/*
217
-------------------------------------------------------------------------------
218
Returns the exponent bits of the single-precision floating-point value `a'.
219
-------------------------------------------------------------------------------
220
*/
221
INLINE int16 extractFloat32Exp( float32 a )
222
{
223

224
    return ( a>>23 ) & 0xFF;
225

226
}
227

228
/*
229
-------------------------------------------------------------------------------
230
Returns the sign bit of the single-precision floating-point value `a'.
231
-------------------------------------------------------------------------------
232
*/
233
INLINE flag extractFloat32Sign( float32 a )
234
{
235

236
    return a>>31;
237

238
}
239

240
/*
241
-------------------------------------------------------------------------------
242
Normalizes the subnormal single-precision floating-point value represented
243
by the denormalized significand `aSig'.  The normalized exponent and
244
significand are stored at the locations pointed to by `zExpPtr' and
245
`zSigPtr', respectively.
246
-------------------------------------------------------------------------------
247
*/
248
static void
249
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
250
{
251
    int8 shiftCount;
252

253
    shiftCount = countLeadingZeros32( aSig ) - 8;
254
    *zSigPtr = aSig<<shiftCount;
255
    *zExpPtr = 1 - shiftCount;
256

257
}
258

259
/*
260
-------------------------------------------------------------------------------
261
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262
single-precision floating-point value, returning the result.  After being
263
shifted into the proper positions, the three fields are simply added
264
together to form the result.  This means that any integer portion of `zSig'
265
will be added into the exponent.  Since a properly normalized significand
266
will have an integer portion equal to 1, the `zExp' input should be 1 less
267
than the desired result exponent whenever `zSig' is a complete, normalized
268
significand.
269
-------------------------------------------------------------------------------
270
*/
271
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
272
{
273

274
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
275

276
}
277

278
/*
279
-------------------------------------------------------------------------------
280
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281
and significand `zSig', and returns the proper single-precision floating-
282
point value corresponding to the abstract input.  Ordinarily, the abstract
283
value is simply rounded and packed into the single-precision format, with
284
the inexact exception raised if the abstract input cannot be represented
285
exactly.  However, if the abstract value is too large, the overflow and
286
inexact exceptions are raised and an infinity or maximal finite value is
287
returned.  If the abstract value is too small, the input value is rounded to
288
a subnormal number, and the underflow and inexact exceptions are raised if
289
the abstract input cannot be represented exactly as a subnormal single-
290
precision floating-point number.
291
    The input significand `zSig' has its binary point between bits 30
292
and 29, which is 7 bits to the left of the usual location.  This shifted
293
significand must be normalized or smaller.  If `zSig' is not normalized,
294
`zExp' must be 0; in that case, the result returned is a subnormal number,
295
and it must not require rounding.  In the usual case that `zSig' is
296
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297
The handling of underflow and overflow follows the IEC/IEEE Standard for
298
Binary Floating-Point Arithmetic.
299
-------------------------------------------------------------------------------
300
*/
301
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
302
{
303
    int8 roundingMode;
304
    flag roundNearestEven;
305
    int8 roundIncrement, roundBits;
306
    flag isTiny;
307

308
    roundingMode = float_rounding_mode;
309
    roundNearestEven = ( roundingMode == float_round_nearest_even );
310
    roundIncrement = 0x40;
311
    if ( ! roundNearestEven ) {
312
        if ( roundingMode == float_round_to_zero ) {
313
            roundIncrement = 0;
314
        }
315
        else {
316
            roundIncrement = 0x7F;
317
            if ( zSign ) {
318
                if ( roundingMode == float_round_up ) roundIncrement = 0;
319
            }
320
            else {
321
                if ( roundingMode == float_round_down ) roundIncrement = 0;
322
            }
323
        }
324
    }
325
    roundBits = zSig & 0x7F;
326
    if ( 0xFD <= (bits16) zExp ) {
327
        if (    ( 0xFD < zExp )
328
             || (    ( zExp == 0xFD )
329
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
330
           ) {
331
            float_raise( float_flag_overflow | float_flag_inexact );
332
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
333
        }
334
        if ( zExp < 0 ) {
335
            isTiny =
336
                   ( float_detect_tininess == float_tininess_before_rounding )
337
                || ( zExp < -1 )
338
                || ( zSig + roundIncrement < 0x80000000 );
339
            shift32RightJamming( zSig, - zExp, &zSig );
340
            zExp = 0;
341
            roundBits = zSig & 0x7F;
342
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
343
        }
344
    }
345
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
346
    zSig = ( zSig + roundIncrement )>>7;
347
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
348
    if ( zSig == 0 ) zExp = 0;
349
    return packFloat32( zSign, zExp, zSig );
350

351
}
352

353
/*
354
-------------------------------------------------------------------------------
355
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356
and significand `zSig', and returns the proper single-precision floating-
357
point value corresponding to the abstract input.  This routine is just like
358
`roundAndPackFloat32' except that `zSig' does not have to be normalized.
359
Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360
floating-point exponent.
361
-------------------------------------------------------------------------------
362
*/
363
static float32
364
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
365
{
366
    int8 shiftCount;
367

368
    shiftCount = countLeadingZeros32( zSig ) - 1;
369
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
370

371
}
372

373
/*
374
-------------------------------------------------------------------------------
375
Returns the fraction bits of the double-precision floating-point value `a'.
376
-------------------------------------------------------------------------------
377
*/
378
INLINE bits64 extractFloat64Frac( float64 a )
379
{
380

381
    return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
382

383
}
384

385
/*
386
-------------------------------------------------------------------------------
387
Returns the exponent bits of the double-precision floating-point value `a'.
388
-------------------------------------------------------------------------------
389
*/
390
INLINE int16 extractFloat64Exp( float64 a )
391
{
392

393
    return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
394

395
}
396

397
/*
398
-------------------------------------------------------------------------------
399
Returns the sign bit of the double-precision floating-point value `a'.
400
-------------------------------------------------------------------------------
401
*/
402
INLINE flag extractFloat64Sign( float64 a )
403
{
404

405
    return FLOAT64_DEMANGLE(a)>>63;
406

407
}
408

409
/*
410
-------------------------------------------------------------------------------
411
Normalizes the subnormal double-precision floating-point value represented
412
by the denormalized significand `aSig'.  The normalized exponent and
413
significand are stored at the locations pointed to by `zExpPtr' and
414
`zSigPtr', respectively.
415
-------------------------------------------------------------------------------
416
*/
417
static void
418
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
419
{
420
    int8 shiftCount;
421

422
    shiftCount = countLeadingZeros64( aSig ) - 11;
423
    *zSigPtr = aSig<<shiftCount;
424
    *zExpPtr = 1 - shiftCount;
425

426
}
427

428
/*
429
-------------------------------------------------------------------------------
430
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431
double-precision floating-point value, returning the result.  After being
432
shifted into the proper positions, the three fields are simply added
433
together to form the result.  This means that any integer portion of `zSig'
434
will be added into the exponent.  Since a properly normalized significand
435
will have an integer portion equal to 1, the `zExp' input should be 1 less
436
than the desired result exponent whenever `zSig' is a complete, normalized
437
significand.
438
-------------------------------------------------------------------------------
439
*/
440
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
441
{
442

443
    return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
444
			   ( ( (bits64) zExp )<<52 ) + zSig );
445

446
}
447

448
/*
449
-------------------------------------------------------------------------------
450
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451
and significand `zSig', and returns the proper double-precision floating-
452
point value corresponding to the abstract input.  Ordinarily, the abstract
453
value is simply rounded and packed into the double-precision format, with
454
the inexact exception raised if the abstract input cannot be represented
455
exactly.  However, if the abstract value is too large, the overflow and
456
inexact exceptions are raised and an infinity or maximal finite value is
457
returned.  If the abstract value is too small, the input value is rounded to
458
a subnormal number, and the underflow and inexact exceptions are raised if
459
the abstract input cannot be represented exactly as a subnormal double-
460
precision floating-point number.
461
    The input significand `zSig' has its binary point between bits 62
462
and 61, which is 10 bits to the left of the usual location.  This shifted
463
significand must be normalized or smaller.  If `zSig' is not normalized,
464
`zExp' must be 0; in that case, the result returned is a subnormal number,
465
and it must not require rounding.  In the usual case that `zSig' is
466
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467
The handling of underflow and overflow follows the IEC/IEEE Standard for
468
Binary Floating-Point Arithmetic.
469
-------------------------------------------------------------------------------
470
*/
471
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
472
{
473
    int8 roundingMode;
474
    flag roundNearestEven;
475
    int16 roundIncrement, roundBits;
476
    flag isTiny;
477

478
    roundingMode = float_rounding_mode;
479
    roundNearestEven = ( roundingMode == float_round_nearest_even );
480
    roundIncrement = 0x200;
481
    if ( ! roundNearestEven ) {
482
        if ( roundingMode == float_round_to_zero ) {
483
            roundIncrement = 0;
484
        }
485
        else {
486
            roundIncrement = 0x3FF;
487
            if ( zSign ) {
488
                if ( roundingMode == float_round_up ) roundIncrement = 0;
489
            }
490
            else {
491
                if ( roundingMode == float_round_down ) roundIncrement = 0;
492
            }
493
        }
494
    }
495
    roundBits = zSig & 0x3FF;
496
    if ( 0x7FD <= (bits16) zExp ) {
497
        if (    ( 0x7FD < zExp )
498
             || (    ( zExp == 0x7FD )
499
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
500
           ) {
501
            float_raise( float_flag_overflow | float_flag_inexact );
502
            return FLOAT64_MANGLE(
503
		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
504
		( roundIncrement == 0 ));
505
        }
506
        if ( zExp < 0 ) {
507
            isTiny =
508
                   ( float_detect_tininess == float_tininess_before_rounding )
509
                || ( zExp < -1 )
510
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
511
            shift64RightJamming( zSig, - zExp, &zSig );
512
            zExp = 0;
513
            roundBits = zSig & 0x3FF;
514
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
515
        }
516
    }
517
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
518
    zSig = ( zSig + roundIncrement )>>10;
519
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
520
    if ( zSig == 0 ) zExp = 0;
521
    return packFloat64( zSign, zExp, zSig );
522

523
}
524

525
/*
526
-------------------------------------------------------------------------------
527
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528
and significand `zSig', and returns the proper double-precision floating-
529
point value corresponding to the abstract input.  This routine is just like
530
`roundAndPackFloat64' except that `zSig' does not have to be normalized.
531
Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532
floating-point exponent.
533
-------------------------------------------------------------------------------
534
*/
535
static float64
536
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
537
{
538
    int8 shiftCount;
539

540
    shiftCount = countLeadingZeros64( zSig ) - 1;
541
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
542

543
}
544

545
#ifdef FLOATX80
546

547
/*
548
-------------------------------------------------------------------------------
549
Returns the fraction bits of the extended double-precision floating-point
550
value `a'.
551
-------------------------------------------------------------------------------
552
*/
553
INLINE bits64 extractFloatx80Frac( floatx80 a )
554
{
555

556
    return a.low;
557

558
}
559

560
/*
561
-------------------------------------------------------------------------------
562
Returns the exponent bits of the extended double-precision floating-point
563
value `a'.
564
-------------------------------------------------------------------------------
565
*/
566
INLINE int32 extractFloatx80Exp( floatx80 a )
567
{
568

569
    return a.high & 0x7FFF;
570

571
}
572

573
/*
574
-------------------------------------------------------------------------------
575
Returns the sign bit of the extended double-precision floating-point value
576
`a'.
577
-------------------------------------------------------------------------------
578
*/
579
INLINE flag extractFloatx80Sign( floatx80 a )
580
{
581

582
    return a.high>>15;
583

584
}
585

586
/*
587
-------------------------------------------------------------------------------
588
Normalizes the subnormal extended double-precision floating-point value
589
represented by the denormalized significand `aSig'.  The normalized exponent
590
and significand are stored at the locations pointed to by `zExpPtr' and
591
`zSigPtr', respectively.
592
-------------------------------------------------------------------------------
593
*/
594
static void
595
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
596
{
597
    int8 shiftCount;
598

599
    shiftCount = countLeadingZeros64( aSig );
600
    *zSigPtr = aSig<<shiftCount;
601
    *zExpPtr = 1 - shiftCount;
602

603
}
604

605
/*
606
-------------------------------------------------------------------------------
607
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608
extended double-precision floating-point value, returning the result.
609
-------------------------------------------------------------------------------
610
*/
611
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
612
{
613
    floatx80 z;
614

615
    z.low = zSig;
616
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
617
    return z;
618

619
}
620

621
/*
622
-------------------------------------------------------------------------------
623
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624
and extended significand formed by the concatenation of `zSig0' and `zSig1',
625
and returns the proper extended double-precision floating-point value
626
corresponding to the abstract input.  Ordinarily, the abstract value is
627
rounded and packed into the extended double-precision format, with the
628
inexact exception raised if the abstract input cannot be represented
629
exactly.  However, if the abstract value is too large, the overflow and
630
inexact exceptions are raised and an infinity or maximal finite value is
631
returned.  If the abstract value is too small, the input value is rounded to
632
a subnormal number, and the underflow and inexact exceptions are raised if
633
the abstract input cannot be represented exactly as a subnormal extended
634
double-precision floating-point number.
635
    If `roundingPrecision' is 32 or 64, the result is rounded to the same
636
number of bits as single or double precision, respectively.  Otherwise, the
637
result is rounded to the full precision of the extended double-precision
638
format.
639
    The input significand must be normalized or smaller.  If the input
640
significand is not normalized, `zExp' must be 0; in that case, the result
641
returned is a subnormal number, and it must not require rounding.  The
642
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643
Floating-Point Arithmetic.
644
-------------------------------------------------------------------------------
645
*/
646
static floatx80
647
 roundAndPackFloatx80(
648
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
649
 )
650
{
651
    int8 roundingMode;
652
    flag roundNearestEven, increment, isTiny;
653
    int64 roundIncrement, roundMask, roundBits;
654

655
    roundingMode = float_rounding_mode;
656
    roundNearestEven = ( roundingMode == float_round_nearest_even );
657
    if ( roundingPrecision == 80 ) goto precision80;
658
    if ( roundingPrecision == 64 ) {
659
        roundIncrement = LIT64( 0x0000000000000400 );
660
        roundMask = LIT64( 0x00000000000007FF );
661
    }
662
    else if ( roundingPrecision == 32 ) {
663
        roundIncrement = LIT64( 0x0000008000000000 );
664
        roundMask = LIT64( 0x000000FFFFFFFFFF );
665
    }
666
    else {
667
        goto precision80;
668
    }
669
    zSig0 |= ( zSig1 != 0 );
670
    if ( ! roundNearestEven ) {
671
        if ( roundingMode == float_round_to_zero ) {
672
            roundIncrement = 0;
673
        }
674
        else {
675
            roundIncrement = roundMask;
676
            if ( zSign ) {
677
                if ( roundingMode == float_round_up ) roundIncrement = 0;
678
            }
679
            else {
680
                if ( roundingMode == float_round_down ) roundIncrement = 0;
681
            }
682
        }
683
    }
684
    roundBits = zSig0 & roundMask;
685
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686
        if (    ( 0x7FFE < zExp )
687
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
688
           ) {
689
            goto overflow;
690
        }
691
        if ( zExp <= 0 ) {
692
            isTiny =
693
                   ( float_detect_tininess == float_tininess_before_rounding )
694
                || ( zExp < 0 )
695
                || ( zSig0 <= zSig0 + roundIncrement );
696
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
697
            zExp = 0;
698
            roundBits = zSig0 & roundMask;
699
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
700
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
701
            zSig0 += roundIncrement;
702
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
703
            roundIncrement = roundMask + 1;
704
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
705
                roundMask |= roundIncrement;
706
            }
707
            zSig0 &= ~ roundMask;
708
            return packFloatx80( zSign, zExp, zSig0 );
709
        }
710
    }
711
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
712
    zSig0 += roundIncrement;
713
    if ( zSig0 < roundIncrement ) {
714
        ++zExp;
715
        zSig0 = LIT64( 0x8000000000000000 );
716
    }
717
    roundIncrement = roundMask + 1;
718
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719
        roundMask |= roundIncrement;
720
    }
721
    zSig0 &= ~ roundMask;
722
    if ( zSig0 == 0 ) zExp = 0;
723
    return packFloatx80( zSign, zExp, zSig0 );
724
 precision80:
725
    increment = ( (sbits64) zSig1 < 0 );
726
    if ( ! roundNearestEven ) {
727
        if ( roundingMode == float_round_to_zero ) {
728
            increment = 0;
729
        }
730
        else {
731
            if ( zSign ) {
732
                increment = ( roundingMode == float_round_down ) && zSig1;
733
            }
734
            else {
735
                increment = ( roundingMode == float_round_up ) && zSig1;
736
            }
737
        }
738
    }
739
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
740
        if (    ( 0x7FFE < zExp )
741
             || (    ( zExp == 0x7FFE )
742
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
743
                  && increment
744
                )
745
           ) {
746
            roundMask = 0;
747
 overflow:
748
            float_raise( float_flag_overflow | float_flag_inexact );
749
            if (    ( roundingMode == float_round_to_zero )
750
                 || ( zSign && ( roundingMode == float_round_up ) )
751
                 || ( ! zSign && ( roundingMode == float_round_down ) )
752
               ) {
753
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
754
            }
755
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
756
        }
757
        if ( zExp <= 0 ) {
758
            isTiny =
759
                   ( float_detect_tininess == float_tininess_before_rounding )
760
                || ( zExp < 0 )
761
                || ! increment
762
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
763
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
764
            zExp = 0;
765
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
766
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
767
            if ( roundNearestEven ) {
768
                increment = ( (sbits64) zSig1 < 0 );
769
            }
770
            else {
771
                if ( zSign ) {
772
                    increment = ( roundingMode == float_round_down ) && zSig1;
773
                }
774
                else {
775
                    increment = ( roundingMode == float_round_up ) && zSig1;
776
                }
777
            }
778
            if ( increment ) {
779
                ++zSig0;
780
                zSig0 &=
781
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
782
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
783
            }
784
            return packFloatx80( zSign, zExp, zSig0 );
785
        }
786
    }
787
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
788
    if ( increment ) {
789
        ++zSig0;
790
        if ( zSig0 == 0 ) {
791
            ++zExp;
792
            zSig0 = LIT64( 0x8000000000000000 );
793
        }
794
        else {
795
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796
        }
797
    }
798
    else {
799
        if ( zSig0 == 0 ) zExp = 0;
800
    }
801
    return packFloatx80( zSign, zExp, zSig0 );
802

803
}
804

805
/*
806
-------------------------------------------------------------------------------
807
Takes an abstract floating-point value having sign `zSign', exponent
808
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809
and returns the proper extended double-precision floating-point value
810
corresponding to the abstract input.  This routine is just like
811
`roundAndPackFloatx80' except that the input significand does not have to be
812
normalized.
813
-------------------------------------------------------------------------------
814
*/
815
static floatx80
816
 normalizeRoundAndPackFloatx80(
817
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818
 )
819
{
820
    int8 shiftCount;
821

822
    if ( zSig0 == 0 ) {
823
        zSig0 = zSig1;
824
        zSig1 = 0;
825
        zExp -= 64;
826
    }
827
    shiftCount = countLeadingZeros64( zSig0 );
828
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829
    zExp -= shiftCount;
830
    return
831
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832

833
}
834

835
#endif
836

837
#ifdef FLOAT128
838

839
/*
840
-------------------------------------------------------------------------------
841
Returns the least-significant 64 fraction bits of the quadruple-precision
842
floating-point value `a'.
843
-------------------------------------------------------------------------------
844
*/
845
INLINE bits64 extractFloat128Frac1( float128 a )
846
{
847

848
    return a.low;
849

850
}
851

852
/*
853
-------------------------------------------------------------------------------
854
Returns the most-significant 48 fraction bits of the quadruple-precision
855
floating-point value `a'.
856
-------------------------------------------------------------------------------
857
*/
858
INLINE bits64 extractFloat128Frac0( float128 a )
859
{
860

861
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
862

863
}
864

865
/*
866
-------------------------------------------------------------------------------
867
Returns the exponent bits of the quadruple-precision floating-point value
868
`a'.
869
-------------------------------------------------------------------------------
870
*/
871
INLINE int32 extractFloat128Exp( float128 a )
872
{
873

874
    return ( a.high>>48 ) & 0x7FFF;
875

876
}
877

878
/*
879
-------------------------------------------------------------------------------
880
Returns the sign bit of the quadruple-precision floating-point value `a'.
881
-------------------------------------------------------------------------------
882
*/
883
INLINE flag extractFloat128Sign( float128 a )
884
{
885

886
    return a.high>>63;
887

888
}
889

890
/*
891
-------------------------------------------------------------------------------
892
Normalizes the subnormal quadruple-precision floating-point value
893
represented by the denormalized significand formed by the concatenation of
894
`aSig0' and `aSig1'.  The normalized exponent is stored at the location
895
pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
896
significand are stored at the location pointed to by `zSig0Ptr', and the
897
least significant 64 bits of the normalized significand are stored at the
898
location pointed to by `zSig1Ptr'.
899
-------------------------------------------------------------------------------
900
*/
901
static void
902
 normalizeFloat128Subnormal(
903
     bits64 aSig0,
904
     bits64 aSig1,
905
     int32 *zExpPtr,
906
     bits64 *zSig0Ptr,
907
     bits64 *zSig1Ptr
908
 )
909
{
910
    int8 shiftCount;
911

912
    if ( aSig0 == 0 ) {
913
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
914
        if ( shiftCount < 0 ) {
915
            *zSig0Ptr = aSig1>>( - shiftCount );
916
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
917
        }
918
        else {
919
            *zSig0Ptr = aSig1<<shiftCount;
920
            *zSig1Ptr = 0;
921
        }
922
        *zExpPtr = - shiftCount - 63;
923
    }
924
    else {
925
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
926
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927
        *zExpPtr = 1 - shiftCount;
928
    }
929

930
}
931

932
/*
933
-------------------------------------------------------------------------------
934
Packs the sign `zSign', the exponent `zExp', and the significand formed
935
by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936
floating-point value, returning the result.  After being shifted into the
937
proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938
added together to form the most significant 32 bits of the result.  This
939
means that any integer portion of `zSig0' will be added into the exponent.
940
Since a properly normalized significand will have an integer portion equal
941
to 1, the `zExp' input should be 1 less than the desired result exponent
942
whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943
significand.
944
-------------------------------------------------------------------------------
945
*/
946
INLINE float128
947
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948
{
949
    float128 z;
950

951
    z.low = zSig1;
952
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
953
    return z;
954

955
}
956

957
/*
958
-------------------------------------------------------------------------------
959
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960
and extended significand formed by the concatenation of `zSig0', `zSig1',
961
and `zSig2', and returns the proper quadruple-precision floating-point value
962
corresponding to the abstract input.  Ordinarily, the abstract value is
963
simply rounded and packed into the quadruple-precision format, with the
964
inexact exception raised if the abstract input cannot be represented
965
exactly.  However, if the abstract value is too large, the overflow and
966
inexact exceptions are raised and an infinity or maximal finite value is
967
returned.  If the abstract value is too small, the input value is rounded to
968
a subnormal number, and the underflow and inexact exceptions are raised if
969
the abstract input cannot be represented exactly as a subnormal quadruple-
970
precision floating-point number.
971
    The input significand must be normalized or smaller.  If the input
972
significand is not normalized, `zExp' must be 0; in that case, the result
973
returned is a subnormal number, and it must not require rounding.  In the
974
usual case that the input significand is normalized, `zExp' must be 1 less
975
than the ``true'' floating-point exponent.  The handling of underflow and
976
overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977
-------------------------------------------------------------------------------
978
*/
979
static float128
980
 roundAndPackFloat128(
981
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
982
{
983
    int8 roundingMode;
984
    flag roundNearestEven, increment, isTiny;
985

986
    roundingMode = float_rounding_mode;
987
    roundNearestEven = ( roundingMode == float_round_nearest_even );
988
    increment = ( (sbits64) zSig2 < 0 );
989
    if ( ! roundNearestEven ) {
990
        if ( roundingMode == float_round_to_zero ) {
991
            increment = 0;
992
        }
993
        else {
994
            if ( zSign ) {
995
                increment = ( roundingMode == float_round_down ) && zSig2;
996
            }
997
            else {
998
                increment = ( roundingMode == float_round_up ) && zSig2;
999
            }
1000
        }
1001
    }
1002
    if ( 0x7FFD <= (bits32) zExp ) {
1003
        if (    ( 0x7FFD < zExp )
1004
             || (    ( zExp == 0x7FFD )
1005
                  && eq128(
1006
                         LIT64( 0x0001FFFFFFFFFFFF ),
1007
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1008
                         zSig0,
1009
                         zSig1
1010
                     )
1011
                  && increment
1012
                )
1013
           ) {
1014
            float_raise( float_flag_overflow | float_flag_inexact );
1015
            if (    ( roundingMode == float_round_to_zero )
1016
                 || ( zSign && ( roundingMode == float_round_up ) )
1017
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1018
               ) {
1019
                return
1020
                    packFloat128(
1021
                        zSign,
1022
                        0x7FFE,
1023
                        LIT64( 0x0000FFFFFFFFFFFF ),
1024
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1025
                    );
1026
            }
1027
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1028
        }
1029
        if ( zExp < 0 ) {
1030
            isTiny =
1031
                   ( float_detect_tininess == float_tininess_before_rounding )
1032
                || ( zExp < -1 )
1033
                || ! increment
1034
                || lt128(
1035
                       zSig0,
1036
                       zSig1,
1037
                       LIT64( 0x0001FFFFFFFFFFFF ),
1038
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1039
                   );
1040
            shift128ExtraRightJamming(
1041
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042
            zExp = 0;
1043
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044
            if ( roundNearestEven ) {
1045
                increment = ( (sbits64) zSig2 < 0 );
1046
            }
1047
            else {
1048
                if ( zSign ) {
1049
                    increment = ( roundingMode == float_round_down ) && zSig2;
1050
                }
1051
                else {
1052
                    increment = ( roundingMode == float_round_up ) && zSig2;
1053
                }
1054
            }
1055
        }
1056
    }
1057
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058
    if ( increment ) {
1059
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061
    }
1062
    else {
1063
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064
    }
1065
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1066

1067
}
1068

1069
/*
1070
-------------------------------------------------------------------------------
1071
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072
and significand formed by the concatenation of `zSig0' and `zSig1', and
1073
returns the proper quadruple-precision floating-point value corresponding
1074
to the abstract input.  This routine is just like `roundAndPackFloat128'
1075
except that the input significand has fewer bits and does not have to be
1076
normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1077
point exponent.
1078
-------------------------------------------------------------------------------
1079
*/
1080
static float128
1081
 normalizeRoundAndPackFloat128(
1082
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1083
{
1084
    int8 shiftCount;
1085
    bits64 zSig2;
1086

1087
    if ( zSig0 == 0 ) {
1088
        zSig0 = zSig1;
1089
        zSig1 = 0;
1090
        zExp -= 64;
1091
    }
1092
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093
    if ( 0 <= shiftCount ) {
1094
        zSig2 = 0;
1095
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1096
    }
1097
    else {
1098
        shift128ExtraRightJamming(
1099
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1100
    }
1101
    zExp -= shiftCount;
1102
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1103

1104
}
1105

1106
#endif
1107

1108
/*
1109
-------------------------------------------------------------------------------
1110
Returns the result of converting the 32-bit two's complement integer `a'
1111
to the single-precision floating-point format.  The conversion is performed
1112
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113
-------------------------------------------------------------------------------
1114
*/
1115
float32 int32_to_float32( int32 a )
1116
{
1117
    flag zSign;
1118

1119
    if ( a == 0 ) return 0;
1120
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121
    zSign = ( a < 0 );
1122
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1123

1124
}
1125

1126
#ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
1127
float32 uint32_to_float32( uint32 a )
1128
{
1129
    if ( a == 0 ) return 0;
1130
    if ( a & (bits32) 0x80000000 )
1131
	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1132
    return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1133
}
1134
#endif
1135

1136

1137
/*
1138
-------------------------------------------------------------------------------
1139
Returns the result of converting the 32-bit two's complement integer `a'
1140
to the double-precision floating-point format.  The conversion is performed
1141
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142
-------------------------------------------------------------------------------
1143
*/
1144
float64 int32_to_float64( int32 a )
1145
{
1146
    flag zSign;
1147
    uint32 absA;
1148
    int8 shiftCount;
1149
    bits64 zSig;
1150

1151
    if ( a == 0 ) return 0;
1152
    zSign = ( a < 0 );
1153
    absA = zSign ? - a : a;
1154
    shiftCount = countLeadingZeros32( absA ) + 21;
1155
    zSig = absA;
1156
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157

1158
}
1159

1160
#ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
1161
float64 uint32_to_float64( uint32 a )
1162
{
1163
    int8 shiftCount;
1164
    bits64 zSig = a;
1165

1166
    if ( a == 0 ) return 0;
1167
    shiftCount = countLeadingZeros32( a ) + 21;
1168
    return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169

1170
}
1171
#endif
1172

1173
#ifdef FLOATX80
1174

1175
/*
1176
-------------------------------------------------------------------------------
1177
Returns the result of converting the 32-bit two's complement integer `a'
1178
to the extended double-precision floating-point format.  The conversion
1179
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1180
Arithmetic.
1181
-------------------------------------------------------------------------------
1182
*/
1183
floatx80 int32_to_floatx80( int32 a )
1184
{
1185
    flag zSign;
1186
    uint32 absA;
1187
    int8 shiftCount;
1188
    bits64 zSig;
1189

1190
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1191
    zSign = ( a < 0 );
1192
    absA = zSign ? - a : a;
1193
    shiftCount = countLeadingZeros32( absA ) + 32;
1194
    zSig = absA;
1195
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1196

1197
}
1198

1199
floatx80 uint32_to_floatx80( uint32 a )
1200
{
1201
    int8 shiftCount;
1202
    bits64 zSig = a;
1203

1204
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1205
    shiftCount = countLeadingZeros32( a ) + 32;
1206
    return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1207

1208
}
1209

1210
#endif
1211

1212
#ifdef FLOAT128
1213

1214
/*
1215
-------------------------------------------------------------------------------
1216
Returns the result of converting the 32-bit two's complement integer `a' to
1217
the quadruple-precision floating-point format.  The conversion is performed
1218
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219
-------------------------------------------------------------------------------
1220
*/
1221
float128 int32_to_float128( int32 a )
1222
{
1223
    flag zSign;
1224
    uint32 absA;
1225
    int8 shiftCount;
1226
    bits64 zSig0;
1227

1228
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229
    zSign = ( a < 0 );
1230
    absA = zSign ? - a : a;
1231
    shiftCount = countLeadingZeros32( absA ) + 17;
1232
    zSig0 = absA;
1233
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234

1235
}
1236

1237
float128 uint32_to_float128( uint32 a )
1238
{
1239
    int8 shiftCount;
1240
    bits64 zSig0 = a;
1241

1242
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1243
    shiftCount = countLeadingZeros32( a ) + 17;
1244
    return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1245

1246
}
1247

1248
#endif
1249

1250
#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1251
/*
1252
-------------------------------------------------------------------------------
1253
Returns the result of converting the 64-bit two's complement integer `a'
1254
to the single-precision floating-point format.  The conversion is performed
1255
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256
-------------------------------------------------------------------------------
1257
*/
1258
float32 int64_to_float32( int64 a )
1259
{
1260
    flag zSign;
1261
    uint64 absA;
1262
    int8 shiftCount;
1263

1264
    if ( a == 0 ) return 0;
1265
    zSign = ( a < 0 );
1266
    absA = zSign ? - a : a;
1267
    shiftCount = countLeadingZeros64( absA ) - 40;
1268
    if ( 0 <= shiftCount ) {
1269
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270
    }
1271
    else {
1272
        shiftCount += 7;
1273
        if ( shiftCount < 0 ) {
1274
            shift64RightJamming( absA, - shiftCount, &absA );
1275
        }
1276
        else {
1277
            absA <<= shiftCount;
1278
        }
1279
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1280
    }
1281

1282
}
1283

1284
/*
1285
-------------------------------------------------------------------------------
1286
Returns the result of converting the 64-bit two's complement integer `a'
1287
to the double-precision floating-point format.  The conversion is performed
1288
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289
-------------------------------------------------------------------------------
1290
*/
1291
float64 int64_to_float64( int64 a )
1292
{
1293
    flag zSign;
1294

1295
    if ( a == 0 ) return 0;
1296
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1297
        return packFloat64( 1, 0x43E, 0 );
1298
    }
1299
    zSign = ( a < 0 );
1300
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1301

1302
}
1303

1304
#ifdef FLOATX80
1305

1306
/*
1307
-------------------------------------------------------------------------------
1308
Returns the result of converting the 64-bit two's complement integer `a'
1309
to the extended double-precision floating-point format.  The conversion
1310
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1311
Arithmetic.
1312
-------------------------------------------------------------------------------
1313
*/
1314
floatx80 int64_to_floatx80( int64 a )
1315
{
1316
    flag zSign;
1317
    uint64 absA;
1318
    int8 shiftCount;
1319

1320
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1321
    zSign = ( a < 0 );
1322
    absA = zSign ? - a : a;
1323
    shiftCount = countLeadingZeros64( absA );
1324
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1325

1326
}
1327

1328
#endif
1329

1330
#endif /* !SOFTFLOAT_FOR_GCC */
1331

1332
#ifdef FLOAT128
1333

1334
/*
1335
-------------------------------------------------------------------------------
1336
Returns the result of converting the 64-bit two's complement integer `a' to
1337
the quadruple-precision floating-point format.  The conversion is performed
1338
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339
-------------------------------------------------------------------------------
1340
*/
1341
float128 int64_to_float128( int64 a )
1342
{
1343
    flag zSign;
1344
    uint64 absA;
1345
    int8 shiftCount;
1346
    int32 zExp;
1347
    bits64 zSig0, zSig1;
1348

1349
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1350
    zSign = ( a < 0 );
1351
    absA = zSign ? - a : a;
1352
    shiftCount = countLeadingZeros64( absA ) + 49;
1353
    zExp = 0x406E - shiftCount;
1354
    if ( 64 <= shiftCount ) {
1355
        zSig1 = 0;
1356
        zSig0 = absA;
1357
        shiftCount -= 64;
1358
    }
1359
    else {
1360
        zSig1 = absA;
1361
        zSig0 = 0;
1362
    }
1363
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1364
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1365

1366
}
1367

1368
#endif
1369

1370
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1371
/*
1372
-------------------------------------------------------------------------------
1373
Returns the result of converting the single-precision floating-point value
1374
`a' to the 32-bit two's complement integer format.  The conversion is
1375
performed according to the IEC/IEEE Standard for Binary Floating-Point
1376
Arithmetic---which means in particular that the conversion is rounded
1377
according to the current rounding mode.  If `a' is a NaN, the largest
1378
positive integer is returned.  Otherwise, if the conversion overflows, the
1379
largest integer with the same sign as `a' is returned.
1380
-------------------------------------------------------------------------------
1381
*/
1382
int32 float32_to_int32( float32 a )
1383
{
1384
    flag aSign;
1385
    int16 aExp, shiftCount;
1386
    bits32 aSig;
1387
    bits64 aSig64;
1388

1389
    aSig = extractFloat32Frac( a );
1390
    aExp = extractFloat32Exp( a );
1391
    aSign = extractFloat32Sign( a );
1392
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1393
    if ( aExp ) aSig |= 0x00800000;
1394
    shiftCount = 0xAF - aExp;
1395
    aSig64 = aSig;
1396
    aSig64 <<= 32;
1397
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1398
    return roundAndPackInt32( aSign, aSig64 );
1399

1400
}
1401
#endif /* !SOFTFLOAT_FOR_GCC */
1402

1403
/*
1404
-------------------------------------------------------------------------------
1405
Returns the result of converting the single-precision floating-point value
1406
`a' to the 32-bit two's complement integer format.  The conversion is
1407
performed according to the IEC/IEEE Standard for Binary Floating-Point
1408
Arithmetic, except that the conversion is always rounded toward zero.
1409
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1410
the conversion overflows, the largest integer with the same sign as `a' is
1411
returned.
1412
-------------------------------------------------------------------------------
1413
*/
1414
int32 float32_to_int32_round_to_zero( float32 a )
1415
{
1416
    flag aSign;
1417
    int16 aExp, shiftCount;
1418
    bits32 aSig;
1419
    int32 z;
1420

1421
    aSig = extractFloat32Frac( a );
1422
    aExp = extractFloat32Exp( a );
1423
    aSign = extractFloat32Sign( a );
1424
    shiftCount = aExp - 0x9E;
1425
    if ( 0 <= shiftCount ) {
1426
        if ( a != 0xCF000000 ) {
1427
            float_raise( float_flag_invalid );
1428
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1429
        }
1430
        return (sbits32) 0x80000000;
1431
    }
1432
    else if ( aExp <= 0x7E ) {
1433
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1434
        return 0;
1435
    }
1436
    aSig = ( aSig | 0x00800000 )<<8;
1437
    z = aSig>>( - shiftCount );
1438
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1439
        float_exception_flags |= float_flag_inexact;
1440
    }
1441
    if ( aSign ) z = - z;
1442
    return z;
1443

1444
}
1445

1446
#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1447
/*
1448
-------------------------------------------------------------------------------
1449
Returns the result of converting the single-precision floating-point value
1450
`a' to the 64-bit two's complement integer format.  The conversion is
1451
performed according to the IEC/IEEE Standard for Binary Floating-Point
1452
Arithmetic---which means in particular that the conversion is rounded
1453
according to the current rounding mode.  If `a' is a NaN, the largest
1454
positive integer is returned.  Otherwise, if the conversion overflows, the
1455
largest integer with the same sign as `a' is returned.
1456
-------------------------------------------------------------------------------
1457
*/
1458
int64 float32_to_int64( float32 a )
1459
{
1460
    flag aSign;
1461
    int16 aExp, shiftCount;
1462
    bits32 aSig;
1463
    bits64 aSig64, aSigExtra;
1464

1465
    aSig = extractFloat32Frac( a );
1466
    aExp = extractFloat32Exp( a );
1467
    aSign = extractFloat32Sign( a );
1468
    shiftCount = 0xBE - aExp;
1469
    if ( shiftCount < 0 ) {
1470
        float_raise( float_flag_invalid );
1471
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1472
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1473
        }
1474
        return (sbits64) LIT64( 0x8000000000000000 );
1475
    }
1476
    if ( aExp ) aSig |= 0x00800000;
1477
    aSig64 = aSig;
1478
    aSig64 <<= 40;
1479
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1480
    return roundAndPackInt64( aSign, aSig64, aSigExtra );
1481

1482
}
1483

1484
/*
1485
-------------------------------------------------------------------------------
1486
Returns the result of converting the single-precision floating-point value
1487
`a' to the 64-bit two's complement integer format.  The conversion is
1488
performed according to the IEC/IEEE Standard for Binary Floating-Point
1489
Arithmetic, except that the conversion is always rounded toward zero.  If
1490
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1491
conversion overflows, the largest integer with the same sign as `a' is
1492
returned.
1493
-------------------------------------------------------------------------------
1494
*/
1495
int64 float32_to_int64_round_to_zero( float32 a )
1496
{
1497
    flag aSign;
1498
    int16 aExp, shiftCount;
1499
    bits32 aSig;
1500
    bits64 aSig64;
1501
    int64 z;
1502

1503
    aSig = extractFloat32Frac( a );
1504
    aExp = extractFloat32Exp( a );
1505
    aSign = extractFloat32Sign( a );
1506
    shiftCount = aExp - 0xBE;
1507
    if ( 0 <= shiftCount ) {
1508
        if ( a != 0xDF000000 ) {
1509
            float_raise( float_flag_invalid );
1510
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1511
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1512
            }
1513
        }
1514
        return (sbits64) LIT64( 0x8000000000000000 );
1515
    }
1516
    else if ( aExp <= 0x7E ) {
1517
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1518
        return 0;
1519
    }
1520
    aSig64 = aSig | 0x00800000;
1521
    aSig64 <<= 40;
1522
    z = aSig64>>( - shiftCount );
1523
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1524
        float_exception_flags |= float_flag_inexact;
1525
    }
1526
    if ( aSign ) z = - z;
1527
    return z;
1528

1529
}
1530
#endif /* !SOFTFLOAT_FOR_GCC */
1531

1532
/*
1533
-------------------------------------------------------------------------------
1534
Returns the result of converting the single-precision floating-point value
1535
`a' to the double-precision floating-point format.  The conversion is
1536
performed according to the IEC/IEEE Standard for Binary Floating-Point
1537
Arithmetic.
1538
-------------------------------------------------------------------------------
1539
*/
1540
float64 float32_to_float64( float32 a )
1541
{
1542
    flag aSign;
1543
    int16 aExp;
1544
    bits32 aSig;
1545

1546
    aSig = extractFloat32Frac( a );
1547
    aExp = extractFloat32Exp( a );
1548
    aSign = extractFloat32Sign( a );
1549
    if ( aExp == 0xFF ) {
1550
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1551
        return packFloat64( aSign, 0x7FF, 0 );
1552
    }
1553
    if ( aExp == 0 ) {
1554
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1555
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1556
        --aExp;
1557
    }
1558
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1559

1560
}
1561

1562
#ifdef FLOATX80
1563

1564
/*
1565
-------------------------------------------------------------------------------
1566
Returns the result of converting the single-precision floating-point value
1567
`a' to the extended double-precision floating-point format.  The conversion
1568
is performed according to the IEC/IEEE Standard for Binary Floating-Point
1569
Arithmetic.
1570
-------------------------------------------------------------------------------
1571
*/
1572
floatx80 float32_to_floatx80( float32 a )
1573
{
1574
    flag aSign;
1575
    int16 aExp;
1576
    bits32 aSig;
1577

1578
    aSig = extractFloat32Frac( a );
1579
    aExp = extractFloat32Exp( a );
1580
    aSign = extractFloat32Sign( a );
1581
    if ( aExp == 0xFF ) {
1582
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1583
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1584
    }
1585
    if ( aExp == 0 ) {
1586
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1587
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588
    }
1589
    aSig |= 0x00800000;
1590
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1591

1592
}
1593

1594
#endif
1595

1596
#ifdef FLOAT128
1597

1598
/*
1599
-------------------------------------------------------------------------------
1600
Returns the result of converting the single-precision floating-point value
1601
`a' to the double-precision floating-point format.  The conversion is
1602
performed according to the IEC/IEEE Standard for Binary Floating-Point
1603
Arithmetic.
1604
-------------------------------------------------------------------------------
1605
*/
1606
float128 float32_to_float128( float32 a )
1607
{
1608
    flag aSign;
1609
    int16 aExp;
1610
    bits32 aSig;
1611

1612
    aSig = extractFloat32Frac( a );
1613
    aExp = extractFloat32Exp( a );
1614
    aSign = extractFloat32Sign( a );
1615
    if ( aExp == 0xFF ) {
1616
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1617
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1618
    }
1619
    if ( aExp == 0 ) {
1620
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1621
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1622
        --aExp;
1623
    }
1624
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1625

1626
}
1627

1628
#endif
1629

1630
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1631
/*
1632
-------------------------------------------------------------------------------
1633
Rounds the single-precision floating-point value `a' to an integer, and
1634
returns the result as a single-precision floating-point value.  The
1635
operation is performed according to the IEC/IEEE Standard for Binary
1636
Floating-Point Arithmetic.
1637
-------------------------------------------------------------------------------
1638
*/
1639
float32 float32_round_to_int( float32 a )
1640
{
1641
    flag aSign;
1642
    int16 aExp;
1643
    bits32 lastBitMask, roundBitsMask;
1644
    int8 roundingMode;
1645
    float32 z;
1646

1647
    aExp = extractFloat32Exp( a );
1648
    if ( 0x96 <= aExp ) {
1649
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1650
            return propagateFloat32NaN( a, a );
1651
        }
1652
        return a;
1653
    }
1654
    if ( aExp <= 0x7E ) {
1655
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1656
        float_exception_flags |= float_flag_inexact;
1657
        aSign = extractFloat32Sign( a );
1658
        switch ( float_rounding_mode ) {
1659
         case float_round_nearest_even:
1660
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1661
                return packFloat32( aSign, 0x7F, 0 );
1662
            }
1663
            break;
1664
	 case float_round_to_zero:
1665
	    break;
1666
         case float_round_down:
1667
            return aSign ? 0xBF800000 : 0;
1668
         case float_round_up:
1669
            return aSign ? 0x80000000 : 0x3F800000;
1670
        }
1671
        return packFloat32( aSign, 0, 0 );
1672
    }
1673
    lastBitMask = 1;
1674
    lastBitMask <<= 0x96 - aExp;
1675
    roundBitsMask = lastBitMask - 1;
1676
    z = a;
1677
    roundingMode = float_rounding_mode;
1678
    if ( roundingMode == float_round_nearest_even ) {
1679
        z += lastBitMask>>1;
1680
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1681
    }
1682
    else if ( roundingMode != float_round_to_zero ) {
1683
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1684
            z += roundBitsMask;
1685
        }
1686
    }
1687
    z &= ~ roundBitsMask;
1688
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1689
    return z;
1690

1691
}
1692
#endif /* !SOFTFLOAT_FOR_GCC */
1693

1694
/*
1695
-------------------------------------------------------------------------------
1696
Returns the result of adding the absolute values of the single-precision
1697
floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1698
before being returned.  `zSign' is ignored if the result is a NaN.
1699
The addition is performed according to the IEC/IEEE Standard for Binary
1700
Floating-Point Arithmetic.
1701
-------------------------------------------------------------------------------
1702
*/
1703
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1704
{
1705
    int16 aExp, bExp, zExp;
1706
    bits32 aSig, bSig, zSig;
1707
    int16 expDiff;
1708

1709
    aSig = extractFloat32Frac( a );
1710
    aExp = extractFloat32Exp( a );
1711
    bSig = extractFloat32Frac( b );
1712
    bExp = extractFloat32Exp( b );
1713
    expDiff = aExp - bExp;
1714
    aSig <<= 6;
1715
    bSig <<= 6;
1716
    if ( 0 < expDiff ) {
1717
        if ( aExp == 0xFF ) {
1718
            if ( aSig ) return propagateFloat32NaN( a, b );
1719
            return a;
1720
        }
1721
        if ( bExp == 0 ) {
1722
            --expDiff;
1723
        }
1724
        else {
1725
            bSig |= 0x20000000;
1726
        }
1727
        shift32RightJamming( bSig, expDiff, &bSig );
1728
        zExp = aExp;
1729
    }
1730
    else if ( expDiff < 0 ) {
1731
        if ( bExp == 0xFF ) {
1732
            if ( bSig ) return propagateFloat32NaN( a, b );
1733
            return packFloat32( zSign, 0xFF, 0 );
1734
        }
1735
        if ( aExp == 0 ) {
1736
            ++expDiff;
1737
        }
1738
        else {
1739
            aSig |= 0x20000000;
1740
        }
1741
        shift32RightJamming( aSig, - expDiff, &aSig );
1742
        zExp = bExp;
1743
    }
1744
    else {
1745
        if ( aExp == 0xFF ) {
1746
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1747
            return a;
1748
        }
1749
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1750
        zSig = 0x40000000 + aSig + bSig;
1751
        zExp = aExp;
1752
        goto roundAndPack;
1753
    }
1754
    aSig |= 0x20000000;
1755
    zSig = ( aSig + bSig )<<1;
1756
    --zExp;
1757
    if ( (sbits32) zSig < 0 ) {
1758
        zSig = aSig + bSig;
1759
        ++zExp;
1760
    }
1761
 roundAndPack:
1762
    return roundAndPackFloat32( zSign, zExp, zSig );
1763

1764
}
1765

1766
/*
1767
-------------------------------------------------------------------------------
1768
Returns the result of subtracting the absolute values of the single-
1769
precision floating-point values `a' and `b'.  If `zSign' is 1, the
1770
difference is negated before being returned.  `zSign' is ignored if the
1771
result is a NaN.  The subtraction is performed according to the IEC/IEEE
1772
Standard for Binary Floating-Point Arithmetic.
1773
-------------------------------------------------------------------------------
1774
*/
1775
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1776
{
1777
    int16 aExp, bExp, zExp;
1778
    bits32 aSig, bSig, zSig;
1779
    int16 expDiff;
1780

1781
    aSig = extractFloat32Frac( a );
1782
    aExp = extractFloat32Exp( a );
1783
    bSig = extractFloat32Frac( b );
1784
    bExp = extractFloat32Exp( b );
1785
    expDiff = aExp - bExp;
1786
    aSig <<= 7;
1787
    bSig <<= 7;
1788
    if ( 0 < expDiff ) goto aExpBigger;
1789
    if ( expDiff < 0 ) goto bExpBigger;
1790
    if ( aExp == 0xFF ) {
1791
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1792
        float_raise( float_flag_invalid );
1793
        return float32_default_nan;
1794
    }
1795
    if ( aExp == 0 ) {
1796
        aExp = 1;
1797
        bExp = 1;
1798
    }
1799
    if ( bSig < aSig ) goto aBigger;
1800
    if ( aSig < bSig ) goto bBigger;
1801
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1802
 bExpBigger:
1803
    if ( bExp == 0xFF ) {
1804
        if ( bSig ) return propagateFloat32NaN( a, b );
1805
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1806
    }
1807
    if ( aExp == 0 ) {
1808
        ++expDiff;
1809
    }
1810
    else {
1811
        aSig |= 0x40000000;
1812
    }
1813
    shift32RightJamming( aSig, - expDiff, &aSig );
1814
    bSig |= 0x40000000;
1815
 bBigger:
1816
    zSig = bSig - aSig;
1817
    zExp = bExp;
1818
    zSign ^= 1;
1819
    goto normalizeRoundAndPack;
1820
 aExpBigger:
1821
    if ( aExp == 0xFF ) {
1822
        if ( aSig ) return propagateFloat32NaN( a, b );
1823
        return a;
1824
    }
1825
    if ( bExp == 0 ) {
1826
        --expDiff;
1827
    }
1828
    else {
1829
        bSig |= 0x40000000;
1830
    }
1831
    shift32RightJamming( bSig, expDiff, &bSig );
1832
    aSig |= 0x40000000;
1833
 aBigger:
1834
    zSig = aSig - bSig;
1835
    zExp = aExp;
1836
 normalizeRoundAndPack:
1837
    --zExp;
1838
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1839

1840
}
1841

1842
/*
1843
-------------------------------------------------------------------------------
1844
Returns the result of adding the single-precision floating-point values `a'
1845
and `b'.  The operation is performed according to the IEC/IEEE Standard for
1846
Binary Floating-Point Arithmetic.
1847
-------------------------------------------------------------------------------
1848
*/
1849
float32 float32_add( float32 a, float32 b )
1850
{
1851
    flag aSign, bSign;
1852

1853
    aSign = extractFloat32Sign( a );
1854
    bSign = extractFloat32Sign( b );
1855
    if ( aSign == bSign ) {
1856
        return addFloat32Sigs( a, b, aSign );
1857
    }
1858
    else {
1859
        return subFloat32Sigs( a, b, aSign );
1860
    }
1861

1862
}
1863

1864
/*
1865
-------------------------------------------------------------------------------
1866
Returns the result of subtracting the single-precision floating-point values
1867
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1868
for Binary Floating-Point Arithmetic.
1869
-------------------------------------------------------------------------------
1870
*/
1871
float32 float32_sub( float32 a, float32 b )
1872
{
1873
    flag aSign, bSign;
1874

1875
    aSign = extractFloat32Sign( a );
1876
    bSign = extractFloat32Sign( b );
1877
    if ( aSign == bSign ) {
1878
        return subFloat32Sigs( a, b, aSign );
1879
    }
1880
    else {
1881
        return addFloat32Sigs( a, b, aSign );
1882
    }
1883

1884
}
1885

1886
/*
1887
-------------------------------------------------------------------------------
1888
Returns the result of multiplying the single-precision floating-point values
1889
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1890
for Binary Floating-Point Arithmetic.
1891
-------------------------------------------------------------------------------
1892
*/
1893
float32 float32_mul( float32 a, float32 b )
1894
{
1895
    flag aSign, bSign, zSign;
1896
    int16 aExp, bExp, zExp;
1897
    bits32 aSig, bSig;
1898
    bits64 zSig64;
1899
    bits32 zSig;
1900

1901
    aSig = extractFloat32Frac( a );
1902
    aExp = extractFloat32Exp( a );
1903
    aSign = extractFloat32Sign( a );
1904
    bSig = extractFloat32Frac( b );
1905
    bExp = extractFloat32Exp( b );
1906
    bSign = extractFloat32Sign( b );
1907
    zSign = aSign ^ bSign;
1908
    if ( aExp == 0xFF ) {
1909
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1910
            return propagateFloat32NaN( a, b );
1911
        }
1912
        if ( ( bExp | bSig ) == 0 ) {
1913
            float_raise( float_flag_invalid );
1914
            return float32_default_nan;
1915
        }
1916
        return packFloat32( zSign, 0xFF, 0 );
1917
    }
1918
    if ( bExp == 0xFF ) {
1919
        if ( bSig ) return propagateFloat32NaN( a, b );
1920
        if ( ( aExp | aSig ) == 0 ) {
1921
            float_raise( float_flag_invalid );
1922
            return float32_default_nan;
1923
        }
1924
        return packFloat32( zSign, 0xFF, 0 );
1925
    }
1926
    if ( aExp == 0 ) {
1927
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1928
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1929
    }
1930
    if ( bExp == 0 ) {
1931
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1932
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933
    }
1934
    zExp = aExp + bExp - 0x7F;
1935
    aSig = ( aSig | 0x00800000 )<<7;
1936
    bSig = ( bSig | 0x00800000 )<<8;
1937
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1938
    zSig = zSig64;
1939
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1940
        zSig <<= 1;
1941
        --zExp;
1942
    }
1943
    return roundAndPackFloat32( zSign, zExp, zSig );
1944

1945
}
1946

1947
/*
1948
-------------------------------------------------------------------------------
1949
Returns the result of dividing the single-precision floating-point value `a'
1950
by the corresponding value `b'.  The operation is performed according to the
1951
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952
-------------------------------------------------------------------------------
1953
*/
1954
float32 float32_div( float32 a, float32 b )
1955
{
1956
    flag aSign, bSign, zSign;
1957
    int16 aExp, bExp, zExp;
1958
    bits32 aSig, bSig, zSig;
1959

1960
    aSig = extractFloat32Frac( a );
1961
    aExp = extractFloat32Exp( a );
1962
    aSign = extractFloat32Sign( a );
1963
    bSig = extractFloat32Frac( b );
1964
    bExp = extractFloat32Exp( b );
1965
    bSign = extractFloat32Sign( b );
1966
    zSign = aSign ^ bSign;
1967
    if ( aExp == 0xFF ) {
1968
        if ( aSig ) return propagateFloat32NaN( a, b );
1969
        if ( bExp == 0xFF ) {
1970
            if ( bSig ) return propagateFloat32NaN( a, b );
1971
            float_raise( float_flag_invalid );
1972
            return float32_default_nan;
1973
        }
1974
        return packFloat32( zSign, 0xFF, 0 );
1975
    }
1976
    if ( bExp == 0xFF ) {
1977
        if ( bSig ) return propagateFloat32NaN( a, b );
1978
        return packFloat32( zSign, 0, 0 );
1979
    }
1980
    if ( bExp == 0 ) {
1981
        if ( bSig == 0 ) {
1982
            if ( ( aExp | aSig ) == 0 ) {
1983
                float_raise( float_flag_invalid );
1984
                return float32_default_nan;
1985
            }
1986
            float_raise( float_flag_divbyzero );
1987
            return packFloat32( zSign, 0xFF, 0 );
1988
        }
1989
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1990
    }
1991
    if ( aExp == 0 ) {
1992
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1993
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1994
    }
1995
    zExp = aExp - bExp + 0x7D;
1996
    aSig = ( aSig | 0x00800000 )<<7;
1997
    bSig = ( bSig | 0x00800000 )<<8;
1998
    if ( bSig <= ( aSig + aSig ) ) {
1999
        aSig >>= 1;
2000
        ++zExp;
2001
    }
2002
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2003
    if ( ( zSig & 0x3F ) == 0 ) {
2004
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2005
    }
2006
    return roundAndPackFloat32( zSign, zExp, zSig );
2007

2008
}
2009

2010
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2011
/*
2012
-------------------------------------------------------------------------------
2013
Returns the remainder of the single-precision floating-point value `a'
2014
with respect to the corresponding value `b'.  The operation is performed
2015
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016
-------------------------------------------------------------------------------
2017
*/
2018
float32 float32_rem( float32 a, float32 b )
2019
{
2020
    flag aSign, bSign, zSign;
2021
    int16 aExp, bExp, expDiff;
2022
    bits32 aSig, bSig;
2023
    bits32 q;
2024
    bits64 aSig64, bSig64, q64;
2025
    bits32 alternateASig;
2026
    sbits32 sigMean;
2027

2028
    aSig = extractFloat32Frac( a );
2029
    aExp = extractFloat32Exp( a );
2030
    aSign = extractFloat32Sign( a );
2031
    bSig = extractFloat32Frac( b );
2032
    bExp = extractFloat32Exp( b );
2033
    bSign = extractFloat32Sign( b );
2034
    if ( aExp == 0xFF ) {
2035
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2036
            return propagateFloat32NaN( a, b );
2037
        }
2038
        float_raise( float_flag_invalid );
2039
        return float32_default_nan;
2040
    }
2041
    if ( bExp == 0xFF ) {
2042
        if ( bSig ) return propagateFloat32NaN( a, b );
2043
        return a;
2044
    }
2045
    if ( bExp == 0 ) {
2046
        if ( bSig == 0 ) {
2047
            float_raise( float_flag_invalid );
2048
            return float32_default_nan;
2049
        }
2050
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2051
    }
2052
    if ( aExp == 0 ) {
2053
        if ( aSig == 0 ) return a;
2054
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2055
    }
2056
    expDiff = aExp - bExp;
2057
    aSig |= 0x00800000;
2058
    bSig |= 0x00800000;
2059
    if ( expDiff < 32 ) {
2060
        aSig <<= 8;
2061
        bSig <<= 8;
2062
        if ( expDiff < 0 ) {
2063
            if ( expDiff < -1 ) return a;
2064
            aSig >>= 1;
2065
        }
2066
        q = ( bSig <= aSig );
2067
        if ( q ) aSig -= bSig;
2068
        if ( 0 < expDiff ) {
2069
            q = ( ( (bits64) aSig )<<32 ) / bSig;
2070
            q >>= 32 - expDiff;
2071
            bSig >>= 2;
2072
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2073
        }
2074
        else {
2075
            aSig >>= 2;
2076
            bSig >>= 2;
2077
        }
2078
    }
2079
    else {
2080
        if ( bSig <= aSig ) aSig -= bSig;
2081
        aSig64 = ( (bits64) aSig )<<40;
2082
        bSig64 = ( (bits64) bSig )<<40;
2083
        expDiff -= 64;
2084
        while ( 0 < expDiff ) {
2085
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2086
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2087
            aSig64 = - ( ( bSig * q64 )<<38 );
2088
            expDiff -= 62;
2089
        }
2090
        expDiff += 64;
2091
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2092
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2093
        q = q64>>( 64 - expDiff );
2094
        bSig <<= 6;
2095
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2096
    }
2097
    do {
2098
        alternateASig = aSig;
2099
        ++q;
2100
        aSig -= bSig;
2101
    } while ( 0 <= (sbits32) aSig );
2102
    sigMean = aSig + alternateASig;
2103
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2104
        aSig = alternateASig;
2105
    }
2106
    zSign = ( (sbits32) aSig < 0 );
2107
    if ( zSign ) aSig = - aSig;
2108
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2109

2110
}
2111
#endif /* !SOFTFLOAT_FOR_GCC */
2112

2113
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2114
/*
2115
-------------------------------------------------------------------------------
2116
Returns the square root of the single-precision floating-point value `a'.
2117
The operation is performed according to the IEC/IEEE Standard for Binary
2118
Floating-Point Arithmetic.
2119
-------------------------------------------------------------------------------
2120
*/
2121
float32 float32_sqrt( float32 a )
2122
{
2123
    flag aSign;
2124
    int16 aExp, zExp;
2125
    bits32 aSig, zSig;
2126
    bits64 rem, term;
2127

2128
    aSig = extractFloat32Frac( a );
2129
    aExp = extractFloat32Exp( a );
2130
    aSign = extractFloat32Sign( a );
2131
    if ( aExp == 0xFF ) {
2132
        if ( aSig ) return propagateFloat32NaN( a, 0 );
2133
        if ( ! aSign ) return a;
2134
        float_raise( float_flag_invalid );
2135
        return float32_default_nan;
2136
    }
2137
    if ( aSign ) {
2138
        if ( ( aExp | aSig ) == 0 ) return a;
2139
        float_raise( float_flag_invalid );
2140
        return float32_default_nan;
2141
    }
2142
    if ( aExp == 0 ) {
2143
        if ( aSig == 0 ) return 0;
2144
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2145
    }
2146
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2147
    aSig = ( aSig | 0x00800000 )<<8;
2148
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2149
    if ( ( zSig & 0x7F ) <= 5 ) {
2150
        if ( zSig < 2 ) {
2151
            zSig = 0x7FFFFFFF;
2152
            goto roundAndPack;
2153
        }
2154
        aSig >>= aExp & 1;
2155
        term = ( (bits64) zSig ) * zSig;
2156
        rem = ( ( (bits64) aSig )<<32 ) - term;
2157
        while ( (sbits64) rem < 0 ) {
2158
            --zSig;
2159
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2160
        }
2161
        zSig |= ( rem != 0 );
2162
    }
2163
    shift32RightJamming( zSig, 1, &zSig );
2164
 roundAndPack:
2165
    return roundAndPackFloat32( 0, zExp, zSig );
2166

2167
}
2168
#endif /* !SOFTFLOAT_FOR_GCC */
2169

2170
/*
2171
-------------------------------------------------------------------------------
2172
Returns 1 if the single-precision floating-point value `a' is equal to
2173
the corresponding value `b', and 0 otherwise.  The comparison is performed
2174
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175
-------------------------------------------------------------------------------
2176
*/
2177
flag float32_eq( float32 a, float32 b )
2178
{
2179

2180
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2181
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2182
       ) {
2183
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2184
            float_raise( float_flag_invalid );
2185
        }
2186
        return 0;
2187
    }
2188
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2189

2190
}
2191

2192
/*
2193
-------------------------------------------------------------------------------
2194
Returns 1 if the single-precision floating-point value `a' is less than
2195
or equal to the corresponding value `b', and 0 otherwise.  The comparison
2196
is performed according to the IEC/IEEE Standard for Binary Floating-Point
2197
Arithmetic.
2198
-------------------------------------------------------------------------------
2199
*/
2200
flag float32_le( float32 a, float32 b )
2201
{
2202
    flag aSign, bSign;
2203

2204
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2205
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2206
       ) {
2207
        float_raise( float_flag_invalid );
2208
        return 0;
2209
    }
2210
    aSign = extractFloat32Sign( a );
2211
    bSign = extractFloat32Sign( b );
2212
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2213
    return ( a == b ) || ( aSign ^ ( a < b ) );
2214

2215
}
2216

2217
/*
2218
-------------------------------------------------------------------------------
2219
Returns 1 if the single-precision floating-point value `a' is less than
2220
the corresponding value `b', and 0 otherwise.  The comparison is performed
2221
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222
-------------------------------------------------------------------------------
2223
*/
2224
flag float32_lt( float32 a, float32 b )
2225
{
2226
    flag aSign, bSign;
2227

2228
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2229
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2230
       ) {
2231
        float_raise( float_flag_invalid );
2232
        return 0;
2233
    }
2234
    aSign = extractFloat32Sign( a );
2235
    bSign = extractFloat32Sign( b );
2236
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2237
    return ( a != b ) && ( aSign ^ ( a < b ) );
2238

2239
}
2240

2241
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2242
/*
2243
-------------------------------------------------------------------------------
2244
Returns 1 if the single-precision floating-point value `a' is equal to
2245
the corresponding value `b', and 0 otherwise.  The invalid exception is
2246
raised if either operand is a NaN.  Otherwise, the comparison is performed
2247
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248
-------------------------------------------------------------------------------
2249
*/
2250
flag float32_eq_signaling( float32 a, float32 b )
2251
{
2252

2253
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2254
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2255
       ) {
2256
        float_raise( float_flag_invalid );
2257
        return 0;
2258
    }
2259
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2260

2261
}
2262

2263
/*
2264
-------------------------------------------------------------------------------
2265
Returns 1 if the single-precision floating-point value `a' is less than or
2266
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2267
cause an exception.  Otherwise, the comparison is performed according to the
2268
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269
-------------------------------------------------------------------------------
2270
*/
2271
flag float32_le_quiet( float32 a, float32 b )
2272
{
2273
    flag aSign, bSign;
2274

2275
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2276
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2277
       ) {
2278
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2279
            float_raise( float_flag_invalid );
2280
        }
2281
        return 0;
2282
    }
2283
    aSign = extractFloat32Sign( a );
2284
    bSign = extractFloat32Sign( b );
2285
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2286
    return ( a == b ) || ( aSign ^ ( a < b ) );
2287

2288
}
2289

2290
/*
2291
-------------------------------------------------------------------------------
2292
Returns 1 if the single-precision floating-point value `a' is less than
2293
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2294
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2295
Standard for Binary Floating-Point Arithmetic.
2296
-------------------------------------------------------------------------------
2297
*/
2298
flag float32_lt_quiet( float32 a, float32 b )
2299
{
2300
    flag aSign, bSign;
2301

2302
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2303
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2304
       ) {
2305
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2306
            float_raise( float_flag_invalid );
2307
        }
2308
        return 0;
2309
    }
2310
    aSign = extractFloat32Sign( a );
2311
    bSign = extractFloat32Sign( b );
2312
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2313
    return ( a != b ) && ( aSign ^ ( a < b ) );
2314

2315
}
2316
#endif /* !SOFTFLOAT_FOR_GCC */
2317

2318
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2319
/*
2320
-------------------------------------------------------------------------------
2321
Returns the result of converting the double-precision floating-point value
2322
`a' to the 32-bit two's complement integer format.  The conversion is
2323
performed according to the IEC/IEEE Standard for Binary Floating-Point
2324
Arithmetic---which means in particular that the conversion is rounded
2325
according to the current rounding mode.  If `a' is a NaN, the largest
2326
positive integer is returned.  Otherwise, if the conversion overflows, the
2327
largest integer with the same sign as `a' is returned.
2328
-------------------------------------------------------------------------------
2329
*/
2330
int32 float64_to_int32( float64 a )
2331
{
2332
    flag aSign;
2333
    int16 aExp, shiftCount;
2334
    bits64 aSig;
2335

2336
    aSig = extractFloat64Frac( a );
2337
    aExp = extractFloat64Exp( a );
2338
    aSign = extractFloat64Sign( a );
2339
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2340
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2341
    shiftCount = 0x42C - aExp;
2342
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2343
    return roundAndPackInt32( aSign, aSig );
2344

2345
}
2346
#endif /* !SOFTFLOAT_FOR_GCC */
2347

2348
/*
2349
-------------------------------------------------------------------------------
2350
Returns the result of converting the double-precision floating-point value
2351
`a' to the 32-bit two's complement integer format.  The conversion is
2352
performed according to the IEC/IEEE Standard for Binary Floating-Point
2353
Arithmetic, except that the conversion is always rounded toward zero.
2354
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2355
the conversion overflows, the largest integer with the same sign as `a' is
2356
returned.
2357
-------------------------------------------------------------------------------
2358
*/
2359
int32 float64_to_int32_round_to_zero( float64 a )
2360
{
2361
    flag aSign;
2362
    int16 aExp, shiftCount;
2363
    bits64 aSig, savedASig;
2364
    int32 z;
2365

2366
    aSig = extractFloat64Frac( a );
2367
    aExp = extractFloat64Exp( a );
2368
    aSign = extractFloat64Sign( a );
2369
    if ( 0x41E < aExp ) {
2370
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2371
        goto invalid;
2372
    }
2373
    else if ( aExp < 0x3FF ) {
2374
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2375
        return 0;
2376
    }
2377
    aSig |= LIT64( 0x0010000000000000 );
2378
    shiftCount = 0x433 - aExp;
2379
    savedASig = aSig;
2380
    aSig >>= shiftCount;
2381
    z = aSig;
2382
    if ( aSign ) z = - z;
2383
    if ( ( z < 0 ) ^ aSign ) {
2384
 invalid:
2385
        float_raise( float_flag_invalid );
2386
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2387
    }
2388
    if ( ( aSig<<shiftCount ) != savedASig ) {
2389
        float_exception_flags |= float_flag_inexact;
2390
    }
2391
    return z;
2392

2393
}
2394

2395
#ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2396
/*
2397
-------------------------------------------------------------------------------
2398
Returns the result of converting the double-precision floating-point value
2399
`a' to the 64-bit two's complement integer format.  The conversion is
2400
performed according to the IEC/IEEE Standard for Binary Floating-Point
2401
Arithmetic---which means in particular that the conversion is rounded
2402
according to the current rounding mode.  If `a' is a NaN, the largest
2403
positive integer is returned.  Otherwise, if the conversion overflows, the
2404
largest integer with the same sign as `a' is returned.
2405
-------------------------------------------------------------------------------
2406
*/
2407
int64 float64_to_int64( float64 a )
2408
{
2409
    flag aSign;
2410
    int16 aExp, shiftCount;
2411
    bits64 aSig, aSigExtra;
2412

2413
    aSig = extractFloat64Frac( a );
2414
    aExp = extractFloat64Exp( a );
2415
    aSign = extractFloat64Sign( a );
2416
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2417
    shiftCount = 0x433 - aExp;
2418
    if ( shiftCount <= 0 ) {
2419
        if ( 0x43E < aExp ) {
2420
            float_raise( float_flag_invalid );
2421
            if (    ! aSign
2422
                 || (    ( aExp == 0x7FF )
2423
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2424
               ) {
2425
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2426
            }
2427
            return (sbits64) LIT64( 0x8000000000000000 );
2428
        }
2429
        aSigExtra = 0;
2430
        aSig <<= - shiftCount;
2431
    }
2432
    else {
2433
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2434
    }
2435
    return roundAndPackInt64( aSign, aSig, aSigExtra );
2436

2437
}
2438

2439
/*
2440
-------------------------------------------------------------------------------
2441
Returns the result of converting the double-precision floating-point value
2442
`a' to the 64-bit two's complement integer format.  The conversion is
2443
performed according to the IEC/IEEE Standard for Binary Floating-Point
2444
Arithmetic, except that the conversion is always rounded toward zero.
2445
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2446
the conversion overflows, the largest integer with the same sign as `a' is
2447
returned.
2448
-------------------------------------------------------------------------------
2449
*/
2450
int64 float64_to_int64_round_to_zero( float64 a )
2451
{
2452
    flag aSign;
2453
    int16 aExp, shiftCount;
2454
    bits64 aSig;
2455
    int64 z;
2456

2457
    aSig = extractFloat64Frac( a );
2458
    aExp = extractFloat64Exp( a );
2459
    aSign = extractFloat64Sign( a );
2460
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2461
    shiftCount = aExp - 0x433;
2462
    if ( 0 <= shiftCount ) {
2463
        if ( 0x43E <= aExp ) {
2464
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2465
                float_raise( float_flag_invalid );
2466
                if (    ! aSign
2467
                     || (    ( aExp == 0x7FF )
2468
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2469
                   ) {
2470
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2471
                }
2472
            }
2473
            return (sbits64) LIT64( 0x8000000000000000 );
2474
        }
2475
        z = aSig<<shiftCount;
2476
    }
2477
    else {
2478
        if ( aExp < 0x3FE ) {
2479
            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2480
            return 0;
2481
        }
2482
        z = aSig>>( - shiftCount );
2483
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2484
            float_exception_flags |= float_flag_inexact;
2485
        }
2486
    }
2487
    if ( aSign ) z = - z;
2488
    return z;
2489

2490
}
2491
#endif /* !SOFTFLOAT_FOR_GCC */
2492

2493
/*
2494
-------------------------------------------------------------------------------
2495
Returns the result of converting the double-precision floating-point value
2496
`a' to the single-precision floating-point format.  The conversion is
2497
performed according to the IEC/IEEE Standard for Binary Floating-Point
2498
Arithmetic.
2499
-------------------------------------------------------------------------------
2500
*/
2501
float32 float64_to_float32( float64 a )
2502
{
2503
    flag aSign;
2504
    int16 aExp;
2505
    bits64 aSig;
2506
    bits32 zSig;
2507

2508
    aSig = extractFloat64Frac( a );
2509
    aExp = extractFloat64Exp( a );
2510
    aSign = extractFloat64Sign( a );
2511
    if ( aExp == 0x7FF ) {
2512
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2513
        return packFloat32( aSign, 0xFF, 0 );
2514
    }
2515
    shift64RightJamming( aSig, 22, &aSig );
2516
    zSig = aSig;
2517
    if ( aExp || zSig ) {
2518
        zSig |= 0x40000000;
2519
        aExp -= 0x381;
2520
    }
2521
    return roundAndPackFloat32( aSign, aExp, zSig );
2522

2523
}
2524

2525
#ifdef FLOATX80
2526

2527
/*
2528
-------------------------------------------------------------------------------
2529
Returns the result of converting the double-precision floating-point value
2530
`a' to the extended double-precision floating-point format.  The conversion
2531
is performed according to the IEC/IEEE Standard for Binary Floating-Point
2532
Arithmetic.
2533
-------------------------------------------------------------------------------
2534
*/
2535
floatx80 float64_to_floatx80( float64 a )
2536
{
2537
    flag aSign;
2538
    int16 aExp;
2539
    bits64 aSig;
2540

2541
    aSig = extractFloat64Frac( a );
2542
    aExp = extractFloat64Exp( a );
2543
    aSign = extractFloat64Sign( a );
2544
    if ( aExp == 0x7FF ) {
2545
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2546
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2547
    }
2548
    if ( aExp == 0 ) {
2549
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2550
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551
    }
2552
    return
2553
        packFloatx80(
2554
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2555

2556
}
2557

2558
#endif
2559

2560
#ifdef FLOAT128
2561

2562
/*
2563
-------------------------------------------------------------------------------
2564
Returns the result of converting the double-precision floating-point value
2565
`a' to the quadruple-precision floating-point format.  The conversion is
2566
performed according to the IEC/IEEE Standard for Binary Floating-Point
2567
Arithmetic.
2568
-------------------------------------------------------------------------------
2569
*/
2570
float128 float64_to_float128( float64 a )
2571
{
2572
    flag aSign;
2573
    int16 aExp;
2574
    bits64 aSig, zSig0, zSig1;
2575

2576
    aSig = extractFloat64Frac( a );
2577
    aExp = extractFloat64Exp( a );
2578
    aSign = extractFloat64Sign( a );
2579
    if ( aExp == 0x7FF ) {
2580
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2581
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2582
    }
2583
    if ( aExp == 0 ) {
2584
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2585
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2586
        --aExp;
2587
    }
2588
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2589
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2590

2591
}
2592

2593
#endif
2594

2595
#ifndef SOFTFLOAT_FOR_GCC
2596
/*
2597
-------------------------------------------------------------------------------
2598
Rounds the double-precision floating-point value `a' to an integer, and
2599
returns the result as a double-precision floating-point value.  The
2600
operation is performed according to the IEC/IEEE Standard for Binary
2601
Floating-Point Arithmetic.
2602
-------------------------------------------------------------------------------
2603
*/
2604
float64 float64_round_to_int( float64 a )
2605
{
2606
    flag aSign;
2607
    int16 aExp;
2608
    bits64 lastBitMask, roundBitsMask;
2609
    int8 roundingMode;
2610
    float64 z;
2611

2612
    aExp = extractFloat64Exp( a );
2613
    if ( 0x433 <= aExp ) {
2614
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2615
            return propagateFloat64NaN( a, a );
2616
        }
2617
        return a;
2618
    }
2619
    if ( aExp < 0x3FF ) {
2620
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2621
        float_exception_flags |= float_flag_inexact;
2622
        aSign = extractFloat64Sign( a );
2623
        switch ( float_rounding_mode ) {
2624
         case float_round_nearest_even:
2625
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2626
                return packFloat64( aSign, 0x3FF, 0 );
2627
            }
2628
            break;
2629
	 case float_round_to_zero:
2630
	    break;
2631
         case float_round_down:
2632
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2633
         case float_round_up:
2634
            return
2635
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2636
        }
2637
        return packFloat64( aSign, 0, 0 );
2638
    }
2639
    lastBitMask = 1;
2640
    lastBitMask <<= 0x433 - aExp;
2641
    roundBitsMask = lastBitMask - 1;
2642
    z = a;
2643
    roundingMode = float_rounding_mode;
2644
    if ( roundingMode == float_round_nearest_even ) {
2645
        z += lastBitMask>>1;
2646
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2647
    }
2648
    else if ( roundingMode != float_round_to_zero ) {
2649
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2650
            z += roundBitsMask;
2651
        }
2652
    }
2653
    z &= ~ roundBitsMask;
2654
    if ( z != a ) float_exception_flags |= float_flag_inexact;
2655
    return z;
2656

2657
}
2658
#endif
2659

2660
/*
2661
-------------------------------------------------------------------------------
2662
Returns the result of adding the absolute values of the double-precision
2663
floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2664
before being returned.  `zSign' is ignored if the result is a NaN.
2665
The addition is performed according to the IEC/IEEE Standard for Binary
2666
Floating-Point Arithmetic.
2667
-------------------------------------------------------------------------------
2668
*/
2669
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2670
{
2671
    int16 aExp, bExp, zExp;
2672
    bits64 aSig, bSig, zSig;
2673
    int16 expDiff;
2674

2675
    aSig = extractFloat64Frac( a );
2676
    aExp = extractFloat64Exp( a );
2677
    bSig = extractFloat64Frac( b );
2678
    bExp = extractFloat64Exp( b );
2679
    expDiff = aExp - bExp;
2680
    aSig <<= 9;
2681
    bSig <<= 9;
2682
    if ( 0 < expDiff ) {
2683
        if ( aExp == 0x7FF ) {
2684
            if ( aSig ) return propagateFloat64NaN( a, b );
2685
            return a;
2686
        }
2687
        if ( bExp == 0 ) {
2688
            --expDiff;
2689
        }
2690
        else {
2691
            bSig |= LIT64( 0x2000000000000000 );
2692
        }
2693
        shift64RightJamming( bSig, expDiff, &bSig );
2694
        zExp = aExp;
2695
    }
2696
    else if ( expDiff < 0 ) {
2697
        if ( bExp == 0x7FF ) {
2698
            if ( bSig ) return propagateFloat64NaN( a, b );
2699
            return packFloat64( zSign, 0x7FF, 0 );
2700
        }
2701
        if ( aExp == 0 ) {
2702
            ++expDiff;
2703
        }
2704
        else {
2705
            aSig |= LIT64( 0x2000000000000000 );
2706
        }
2707
        shift64RightJamming( aSig, - expDiff, &aSig );
2708
        zExp = bExp;
2709
    }
2710
    else {
2711
        if ( aExp == 0x7FF ) {
2712
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713
            return a;
2714
        }
2715
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2716
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2717
        zExp = aExp;
2718
        goto roundAndPack;
2719
    }
2720
    aSig |= LIT64( 0x2000000000000000 );
2721
    zSig = ( aSig + bSig )<<1;
2722
    --zExp;
2723
    if ( (sbits64) zSig < 0 ) {
2724
        zSig = aSig + bSig;
2725
        ++zExp;
2726
    }
2727
 roundAndPack:
2728
    return roundAndPackFloat64( zSign, zExp, zSig );
2729

2730
}
2731

2732
/*
2733
-------------------------------------------------------------------------------
2734
Returns the result of subtracting the absolute values of the double-
2735
precision floating-point values `a' and `b'.  If `zSign' is 1, the
2736
difference is negated before being returned.  `zSign' is ignored if the
2737
result is a NaN.  The subtraction is performed according to the IEC/IEEE
2738
Standard for Binary Floating-Point Arithmetic.
2739
-------------------------------------------------------------------------------
2740
*/
2741
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2742
{
2743
    int16 aExp, bExp, zExp;
2744
    bits64 aSig, bSig, zSig;
2745
    int16 expDiff;
2746

2747
    aSig = extractFloat64Frac( a );
2748
    aExp = extractFloat64Exp( a );
2749
    bSig = extractFloat64Frac( b );
2750
    bExp = extractFloat64Exp( b );
2751
    expDiff = aExp - bExp;
2752
    aSig <<= 10;
2753
    bSig <<= 10;
2754
    if ( 0 < expDiff ) goto aExpBigger;
2755
    if ( expDiff < 0 ) goto bExpBigger;
2756
    if ( aExp == 0x7FF ) {
2757
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2758
        float_raise( float_flag_invalid );
2759
        return float64_default_nan;
2760
    }
2761
    if ( aExp == 0 ) {
2762
        aExp = 1;
2763
        bExp = 1;
2764
    }
2765
    if ( bSig < aSig ) goto aBigger;
2766
    if ( aSig < bSig ) goto bBigger;
2767
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2768
 bExpBigger:
2769
    if ( bExp == 0x7FF ) {
2770
        if ( bSig ) return propagateFloat64NaN( a, b );
2771
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2772
    }
2773
    if ( aExp == 0 ) {
2774
        ++expDiff;
2775
    }
2776
    else {
2777
        aSig |= LIT64( 0x4000000000000000 );
2778
    }
2779
    shift64RightJamming( aSig, - expDiff, &aSig );
2780
    bSig |= LIT64( 0x4000000000000000 );
2781
 bBigger:
2782
    zSig = bSig - aSig;
2783
    zExp = bExp;
2784
    zSign ^= 1;
2785
    goto normalizeRoundAndPack;
2786
 aExpBigger:
2787
    if ( aExp == 0x7FF ) {
2788
        if ( aSig ) return propagateFloat64NaN( a, b );
2789
        return a;
2790
    }
2791
    if ( bExp == 0 ) {
2792
        --expDiff;
2793
    }
2794
    else {
2795
        bSig |= LIT64( 0x4000000000000000 );
2796
    }
2797
    shift64RightJamming( bSig, expDiff, &bSig );
2798
    aSig |= LIT64( 0x4000000000000000 );
2799
 aBigger:
2800
    zSig = aSig - bSig;
2801
    zExp = aExp;
2802
 normalizeRoundAndPack:
2803
    --zExp;
2804
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2805

2806
}
2807

2808
/*
2809
-------------------------------------------------------------------------------
2810
Returns the result of adding the double-precision floating-point values `a'
2811
and `b'.  The operation is performed according to the IEC/IEEE Standard for
2812
Binary Floating-Point Arithmetic.
2813
-------------------------------------------------------------------------------
2814
*/
2815
float64 float64_add( float64 a, float64 b )
2816
{
2817
    flag aSign, bSign;
2818

2819
    aSign = extractFloat64Sign( a );
2820
    bSign = extractFloat64Sign( b );
2821
    if ( aSign == bSign ) {
2822
        return addFloat64Sigs( a, b, aSign );
2823
    }
2824
    else {
2825
        return subFloat64Sigs( a, b, aSign );
2826
    }
2827

2828
}
2829

2830
/*
2831
-------------------------------------------------------------------------------
2832
Returns the result of subtracting the double-precision floating-point values
2833
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2834
for Binary Floating-Point Arithmetic.
2835
-------------------------------------------------------------------------------
2836
*/
2837
float64 float64_sub( float64 a, float64 b )
2838
{
2839
    flag aSign, bSign;
2840

2841
    aSign = extractFloat64Sign( a );
2842
    bSign = extractFloat64Sign( b );
2843
    if ( aSign == bSign ) {
2844
        return subFloat64Sigs( a, b, aSign );
2845
    }
2846
    else {
2847
        return addFloat64Sigs( a, b, aSign );
2848
    }
2849

2850
}
2851

2852
/*
2853
-------------------------------------------------------------------------------
2854
Returns the result of multiplying the double-precision floating-point values
2855
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2856
for Binary Floating-Point Arithmetic.
2857
-------------------------------------------------------------------------------
2858
*/
2859
float64 float64_mul( float64 a, float64 b )
2860
{
2861
    flag aSign, bSign, zSign;
2862
    int16 aExp, bExp, zExp;
2863
    bits64 aSig, bSig, zSig0, zSig1;
2864

2865
    aSig = extractFloat64Frac( a );
2866
    aExp = extractFloat64Exp( a );
2867
    aSign = extractFloat64Sign( a );
2868
    bSig = extractFloat64Frac( b );
2869
    bExp = extractFloat64Exp( b );
2870
    bSign = extractFloat64Sign( b );
2871
    zSign = aSign ^ bSign;
2872
    if ( aExp == 0x7FF ) {
2873
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2874
            return propagateFloat64NaN( a, b );
2875
        }
2876
        if ( ( bExp | bSig ) == 0 ) {
2877
            float_raise( float_flag_invalid );
2878
            return float64_default_nan;
2879
        }
2880
        return packFloat64( zSign, 0x7FF, 0 );
2881
    }
2882
    if ( bExp == 0x7FF ) {
2883
        if ( bSig ) return propagateFloat64NaN( a, b );
2884
        if ( ( aExp | aSig ) == 0 ) {
2885
            float_raise( float_flag_invalid );
2886
            return float64_default_nan;
2887
        }
2888
        return packFloat64( zSign, 0x7FF, 0 );
2889
    }
2890
    if ( aExp == 0 ) {
2891
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2892
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2893
    }
2894
    if ( bExp == 0 ) {
2895
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2896
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897
    }
2898
    zExp = aExp + bExp - 0x3FF;
2899
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2902
    zSig0 |= ( zSig1 != 0 );
2903
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2904
        zSig0 <<= 1;
2905
        --zExp;
2906
    }
2907
    return roundAndPackFloat64( zSign, zExp, zSig0 );
2908

2909
}
2910

2911
/*
2912
-------------------------------------------------------------------------------
2913
Returns the result of dividing the double-precision floating-point value `a'
2914
by the corresponding value `b'.  The operation is performed according to
2915
the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916
-------------------------------------------------------------------------------
2917
*/
2918
float64 float64_div( float64 a, float64 b )
2919
{
2920
    flag aSign, bSign, zSign;
2921
    int16 aExp, bExp, zExp;
2922
    bits64 aSig, bSig, zSig;
2923
    bits64 rem0, rem1;
2924
    bits64 term0, term1;
2925

2926
    aSig = extractFloat64Frac( a );
2927
    aExp = extractFloat64Exp( a );
2928
    aSign = extractFloat64Sign( a );
2929
    bSig = extractFloat64Frac( b );
2930
    bExp = extractFloat64Exp( b );
2931
    bSign = extractFloat64Sign( b );
2932
    zSign = aSign ^ bSign;
2933
    if ( aExp == 0x7FF ) {
2934
        if ( aSig ) return propagateFloat64NaN( a, b );
2935
        if ( bExp == 0x7FF ) {
2936
            if ( bSig ) return propagateFloat64NaN( a, b );
2937
            float_raise( float_flag_invalid );
2938
            return float64_default_nan;
2939
        }
2940
        return packFloat64( zSign, 0x7FF, 0 );
2941
    }
2942
    if ( bExp == 0x7FF ) {
2943
        if ( bSig ) return propagateFloat64NaN( a, b );
2944
        return packFloat64( zSign, 0, 0 );
2945
    }
2946
    if ( bExp == 0 ) {
2947
        if ( bSig == 0 ) {
2948
            if ( ( aExp | aSig ) == 0 ) {
2949
                float_raise( float_flag_invalid );
2950
                return float64_default_nan;
2951
            }
2952
            float_raise( float_flag_divbyzero );
2953
            return packFloat64( zSign, 0x7FF, 0 );
2954
        }
2955
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956
    }
2957
    if ( aExp == 0 ) {
2958
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2959
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960
    }
2961
    zExp = aExp - bExp + 0x3FD;
2962
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2963
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964
    if ( bSig <= ( aSig + aSig ) ) {
2965
        aSig >>= 1;
2966
        ++zExp;
2967
    }
2968
    zSig = estimateDiv128To64( aSig, 0, bSig );
2969
    if ( ( zSig & 0x1FF ) <= 2 ) {
2970
        mul64To128( bSig, zSig, &term0, &term1 );
2971
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2972
        while ( (sbits64) rem0 < 0 ) {
2973
            --zSig;
2974
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2975
        }
2976
        zSig |= ( rem1 != 0 );
2977
    }
2978
    return roundAndPackFloat64( zSign, zExp, zSig );
2979

2980
}
2981

2982
#ifndef SOFTFLOAT_FOR_GCC
2983
/*
2984
-------------------------------------------------------------------------------
2985
Returns the remainder of the double-precision floating-point value `a'
2986
with respect to the corresponding value `b'.  The operation is performed
2987
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988
-------------------------------------------------------------------------------
2989
*/
2990
float64 float64_rem( float64 a, float64 b )
2991
{
2992
    flag aSign, bSign, zSign;
2993
    int16 aExp, bExp, expDiff;
2994
    bits64 aSig, bSig;
2995
    bits64 q, alternateASig;
2996
    sbits64 sigMean;
2997

2998
    aSig = extractFloat64Frac( a );
2999
    aExp = extractFloat64Exp( a );
3000
    aSign = extractFloat64Sign( a );
3001
    bSig = extractFloat64Frac( b );
3002
    bExp = extractFloat64Exp( b );
3003
    bSign = extractFloat64Sign( b );
3004
    if ( aExp == 0x7FF ) {
3005
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3006
            return propagateFloat64NaN( a, b );
3007
        }
3008
        float_raise( float_flag_invalid );
3009
        return float64_default_nan;
3010
    }
3011
    if ( bExp == 0x7FF ) {
3012
        if ( bSig ) return propagateFloat64NaN( a, b );
3013
        return a;
3014
    }
3015
    if ( bExp == 0 ) {
3016
        if ( bSig == 0 ) {
3017
            float_raise( float_flag_invalid );
3018
            return float64_default_nan;
3019
        }
3020
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3021
    }
3022
    if ( aExp == 0 ) {
3023
        if ( aSig == 0 ) return a;
3024
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3025
    }
3026
    expDiff = aExp - bExp;
3027
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3028
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3029
    if ( expDiff < 0 ) {
3030
        if ( expDiff < -1 ) return a;
3031
        aSig >>= 1;
3032
    }
3033
    q = ( bSig <= aSig );
3034
    if ( q ) aSig -= bSig;
3035
    expDiff -= 64;
3036
    while ( 0 < expDiff ) {
3037
        q = estimateDiv128To64( aSig, 0, bSig );
3038
        q = ( 2 < q ) ? q - 2 : 0;
3039
        aSig = - ( ( bSig>>2 ) * q );
3040
        expDiff -= 62;
3041
    }
3042
    expDiff += 64;
3043
    if ( 0 < expDiff ) {
3044
        q = estimateDiv128To64( aSig, 0, bSig );
3045
        q = ( 2 < q ) ? q - 2 : 0;
3046
        q >>= 64 - expDiff;
3047
        bSig >>= 2;
3048
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3049
    }
3050
    else {
3051
        aSig >>= 2;
3052
        bSig >>= 2;
3053
    }
3054
    do {
3055
        alternateASig = aSig;
3056
        ++q;
3057
        aSig -= bSig;
3058
    } while ( 0 <= (sbits64) aSig );
3059
    sigMean = aSig + alternateASig;
3060
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3061
        aSig = alternateASig;
3062
    }
3063
    zSign = ( (sbits64) aSig < 0 );
3064
    if ( zSign ) aSig = - aSig;
3065
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3066

3067
}
3068

3069
/*
3070
-------------------------------------------------------------------------------
3071
Returns the square root of the double-precision floating-point value `a'.
3072
The operation is performed according to the IEC/IEEE Standard for Binary
3073
Floating-Point Arithmetic.
3074
-------------------------------------------------------------------------------
3075
*/
3076
float64 float64_sqrt( float64 a )
3077
{
3078
    flag aSign;
3079
    int16 aExp, zExp;
3080
    bits64 aSig, zSig, doubleZSig;
3081
    bits64 rem0, rem1, term0, term1;
3082

3083
    aSig = extractFloat64Frac( a );
3084
    aExp = extractFloat64Exp( a );
3085
    aSign = extractFloat64Sign( a );
3086
    if ( aExp == 0x7FF ) {
3087
        if ( aSig ) return propagateFloat64NaN( a, a );
3088
        if ( ! aSign ) return a;
3089
        float_raise( float_flag_invalid );
3090
        return float64_default_nan;
3091
    }
3092
    if ( aSign ) {
3093
        if ( ( aExp | aSig ) == 0 ) return a;
3094
        float_raise( float_flag_invalid );
3095
        return float64_default_nan;
3096
    }
3097
    if ( aExp == 0 ) {
3098
        if ( aSig == 0 ) return 0;
3099
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3100
    }
3101
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3102
    aSig |= LIT64( 0x0010000000000000 );
3103
    zSig = estimateSqrt32( aExp, aSig>>21 );
3104
    aSig <<= 9 - ( aExp & 1 );
3105
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3106
    if ( ( zSig & 0x1FF ) <= 5 ) {
3107
        doubleZSig = zSig<<1;
3108
        mul64To128( zSig, zSig, &term0, &term1 );
3109
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3110
        while ( (sbits64) rem0 < 0 ) {
3111
            --zSig;
3112
            doubleZSig -= 2;
3113
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3114
        }
3115
        zSig |= ( ( rem0 | rem1 ) != 0 );
3116
    }
3117
    return roundAndPackFloat64( 0, zExp, zSig );
3118

3119
}
3120
#endif
3121

3122
/*
3123
-------------------------------------------------------------------------------
3124
Returns 1 if the double-precision floating-point value `a' is equal to the
3125
corresponding value `b', and 0 otherwise.  The comparison is performed
3126
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127
-------------------------------------------------------------------------------
3128
*/
3129
flag float64_eq( float64 a, float64 b )
3130
{
3131

3132
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3133
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3134
       ) {
3135
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3136
            float_raise( float_flag_invalid );
3137
        }
3138
        return 0;
3139
    }
3140
    return ( a == b ) ||
3141
	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3142

3143
}
3144

3145
/*
3146
-------------------------------------------------------------------------------
3147
Returns 1 if the double-precision floating-point value `a' is less than or
3148
equal to the corresponding value `b', and 0 otherwise.  The comparison is
3149
performed according to the IEC/IEEE Standard for Binary Floating-Point
3150
Arithmetic.
3151
-------------------------------------------------------------------------------
3152
*/
3153
flag float64_le( float64 a, float64 b )
3154
{
3155
    flag aSign, bSign;
3156

3157
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3158
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3159
       ) {
3160
        float_raise( float_flag_invalid );
3161
        return 0;
3162
    }
3163
    aSign = extractFloat64Sign( a );
3164
    bSign = extractFloat64Sign( b );
3165
    if ( aSign != bSign )
3166
	return aSign ||
3167
	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3168
	      0 );
3169
    return ( a == b ) ||
3170
	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3171

3172
}
3173

3174
/*
3175
-------------------------------------------------------------------------------
3176
Returns 1 if the double-precision floating-point value `a' is less than
3177
the corresponding value `b', and 0 otherwise.  The comparison is performed
3178
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179
-------------------------------------------------------------------------------
3180
*/
3181
flag float64_lt( float64 a, float64 b )
3182
{
3183
    flag aSign, bSign;
3184

3185
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3186
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3187
       ) {
3188
        float_raise( float_flag_invalid );
3189
        return 0;
3190
    }
3191
    aSign = extractFloat64Sign( a );
3192
    bSign = extractFloat64Sign( b );
3193
    if ( aSign != bSign )
3194
	return aSign &&
3195
	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3196
	      0 );
3197
    return ( a != b ) &&
3198
	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3199

3200
}
3201

3202
#ifndef SOFTFLOAT_FOR_GCC
3203
/*
3204
-------------------------------------------------------------------------------
3205
Returns 1 if the double-precision floating-point value `a' is equal to the
3206
corresponding value `b', and 0 otherwise.  The invalid exception is raised
3207
if either operand is a NaN.  Otherwise, the comparison is performed
3208
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209
-------------------------------------------------------------------------------
3210
*/
3211
flag float64_eq_signaling( float64 a, float64 b )
3212
{
3213

3214
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3215
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3216
       ) {
3217
        float_raise( float_flag_invalid );
3218
        return 0;
3219
    }
3220
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3221

3222
}
3223

3224
/*
3225
-------------------------------------------------------------------------------
3226
Returns 1 if the double-precision floating-point value `a' is less than or
3227
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3228
cause an exception.  Otherwise, the comparison is performed according to the
3229
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230
-------------------------------------------------------------------------------
3231
*/
3232
flag float64_le_quiet( float64 a, float64 b )
3233
{
3234
    flag aSign, bSign;
3235

3236
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3237
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3238
       ) {
3239
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3240
            float_raise( float_flag_invalid );
3241
        }
3242
        return 0;
3243
    }
3244
    aSign = extractFloat64Sign( a );
3245
    bSign = extractFloat64Sign( b );
3246
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3247
    return ( a == b ) || ( aSign ^ ( a < b ) );
3248

3249
}
3250

3251
/*
3252
-------------------------------------------------------------------------------
3253
Returns 1 if the double-precision floating-point value `a' is less than
3254
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3255
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3256
Standard for Binary Floating-Point Arithmetic.
3257
-------------------------------------------------------------------------------
3258
*/
3259
flag float64_lt_quiet( float64 a, float64 b )
3260
{
3261
    flag aSign, bSign;
3262

3263
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3264
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3265
       ) {
3266
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3267
            float_raise( float_flag_invalid );
3268
        }
3269
        return 0;
3270
    }
3271
    aSign = extractFloat64Sign( a );
3272
    bSign = extractFloat64Sign( b );
3273
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3274
    return ( a != b ) && ( aSign ^ ( a < b ) );
3275

3276
}
3277
#endif
3278

3279
#ifdef FLOATX80
3280

3281
/*
3282
-------------------------------------------------------------------------------
3283
Returns the result of converting the extended double-precision floating-
3284
point value `a' to the 32-bit two's complement integer format.  The
3285
conversion is performed according to the IEC/IEEE Standard for Binary
3286
Floating-Point Arithmetic---which means in particular that the conversion
3287
is rounded according to the current rounding mode.  If `a' is a NaN, the
3288
largest positive integer is returned.  Otherwise, if the conversion
3289
overflows, the largest integer with the same sign as `a' is returned.
3290
-------------------------------------------------------------------------------
3291
*/
3292
int32 floatx80_to_int32( floatx80 a )
3293
{
3294
    flag aSign;
3295
    int32 aExp, shiftCount;
3296
    bits64 aSig;
3297

3298
    aSig = extractFloatx80Frac( a );
3299
    aExp = extractFloatx80Exp( a );
3300
    aSign = extractFloatx80Sign( a );
3301
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3302
    shiftCount = 0x4037 - aExp;
3303
    if ( shiftCount <= 0 ) shiftCount = 1;
3304
    shift64RightJamming( aSig, shiftCount, &aSig );
3305
    return roundAndPackInt32( aSign, aSig );
3306

3307
}
3308

3309
/*
3310
-------------------------------------------------------------------------------
3311
Returns the result of converting the extended double-precision floating-
3312
point value `a' to the 32-bit two's complement integer format.  The
3313
conversion is performed according to the IEC/IEEE Standard for Binary
3314
Floating-Point Arithmetic, except that the conversion is always rounded
3315
toward zero.  If `a' is a NaN, the largest positive integer is returned.
3316
Otherwise, if the conversion overflows, the largest integer with the same
3317
sign as `a' is returned.
3318
-------------------------------------------------------------------------------
3319
*/
3320
int32 floatx80_to_int32_round_to_zero( floatx80 a )
3321
{
3322
    flag aSign;
3323
    int32 aExp, shiftCount;
3324
    bits64 aSig, savedASig;
3325
    int32 z;
3326

3327
    aSig = extractFloatx80Frac( a );
3328
    aExp = extractFloatx80Exp( a );
3329
    aSign = extractFloatx80Sign( a );
3330
    if ( 0x401E < aExp ) {
3331
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3332
        goto invalid;
3333
    }
3334
    else if ( aExp < 0x3FFF ) {
3335
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3336
        return 0;
3337
    }
3338
    shiftCount = 0x403E - aExp;
3339
    savedASig = aSig;
3340
    aSig >>= shiftCount;
3341
    z = aSig;
3342
    if ( aSign ) z = - z;
3343
    if ( ( z < 0 ) ^ aSign ) {
3344
 invalid:
3345
        float_raise( float_flag_invalid );
3346
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3347
    }
3348
    if ( ( aSig<<shiftCount ) != savedASig ) {
3349
        float_exception_flags |= float_flag_inexact;
3350
    }
3351
    return z;
3352

3353
}
3354

3355
/*
3356
-------------------------------------------------------------------------------
3357
Returns the result of converting the extended double-precision floating-
3358
point value `a' to the 64-bit two's complement integer format.  The
3359
conversion is performed according to the IEC/IEEE Standard for Binary
3360
Floating-Point Arithmetic---which means in particular that the conversion
3361
is rounded according to the current rounding mode.  If `a' is a NaN,
3362
the largest positive integer is returned.  Otherwise, if the conversion
3363
overflows, the largest integer with the same sign as `a' is returned.
3364
-------------------------------------------------------------------------------
3365
*/
3366
int64 floatx80_to_int64( floatx80 a )
3367
{
3368
    flag aSign;
3369
    int32 aExp, shiftCount;
3370
    bits64 aSig, aSigExtra;
3371

3372
    aSig = extractFloatx80Frac( a );
3373
    aExp = extractFloatx80Exp( a );
3374
    aSign = extractFloatx80Sign( a );
3375
    shiftCount = 0x403E - aExp;
3376
    if ( shiftCount <= 0 ) {
3377
        if ( shiftCount ) {
3378
            float_raise( float_flag_invalid );
3379
            if (    ! aSign
3380
                 || (    ( aExp == 0x7FFF )
3381
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3382
               ) {
3383
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3384
            }
3385
            return (sbits64) LIT64( 0x8000000000000000 );
3386
        }
3387
        aSigExtra = 0;
3388
    }
3389
    else {
3390
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3391
    }
3392
    return roundAndPackInt64( aSign, aSig, aSigExtra );
3393

3394
}
3395

3396
/*
3397
-------------------------------------------------------------------------------
3398
Returns the result of converting the extended double-precision floating-
3399
point value `a' to the 64-bit two's complement integer format.  The
3400
conversion is performed according to the IEC/IEEE Standard for Binary
3401
Floating-Point Arithmetic, except that the conversion is always rounded
3402
toward zero.  If `a' is a NaN, the largest positive integer is returned.
3403
Otherwise, if the conversion overflows, the largest integer with the same
3404
sign as `a' is returned.
3405
-------------------------------------------------------------------------------
3406
*/
3407
int64 floatx80_to_int64_round_to_zero( floatx80 a )
3408
{
3409
    flag aSign;
3410
    int32 aExp, shiftCount;
3411
    bits64 aSig;
3412
    int64 z;
3413

3414
    aSig = extractFloatx80Frac( a );
3415
    aExp = extractFloatx80Exp( a );
3416
    aSign = extractFloatx80Sign( a );
3417
    shiftCount = aExp - 0x403E;
3418
    if ( 0 <= shiftCount ) {
3419
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3420
        if ( ( a.high != 0xC03E ) || aSig ) {
3421
            float_raise( float_flag_invalid );
3422
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3423
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3424
            }
3425
        }
3426
        return (sbits64) LIT64( 0x8000000000000000 );
3427
    }
3428
    else if ( aExp < 0x3FFF ) {
3429
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3430
        return 0;
3431
    }
3432
    z = aSig>>( - shiftCount );
3433
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3434
        float_exception_flags |= float_flag_inexact;
3435
    }
3436
    if ( aSign ) z = - z;
3437
    return z;
3438

3439
}
3440

3441
/*
3442
-------------------------------------------------------------------------------
3443
Returns the result of converting the extended double-precision floating-
3444
point value `a' to the single-precision floating-point format.  The
3445
conversion is performed according to the IEC/IEEE Standard for Binary
3446
Floating-Point Arithmetic.
3447
-------------------------------------------------------------------------------
3448
*/
3449
float32 floatx80_to_float32( floatx80 a )
3450
{
3451
    flag aSign;
3452
    int32 aExp;
3453
    bits64 aSig;
3454

3455
    aSig = extractFloatx80Frac( a );
3456
    aExp = extractFloatx80Exp( a );
3457
    aSign = extractFloatx80Sign( a );
3458
    if ( aExp == 0x7FFF ) {
3459
        if ( (bits64) ( aSig<<1 ) ) {
3460
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3461
        }
3462
        return packFloat32( aSign, 0xFF, 0 );
3463
    }
3464
    shift64RightJamming( aSig, 33, &aSig );
3465
    if ( aExp || aSig ) aExp -= 0x3F81;
3466
    return roundAndPackFloat32( aSign, aExp, aSig );
3467

3468
}
3469

3470
/*
3471
-------------------------------------------------------------------------------
3472
Returns the result of converting the extended double-precision floating-
3473
point value `a' to the double-precision floating-point format.  The
3474
conversion is performed according to the IEC/IEEE Standard for Binary
3475
Floating-Point Arithmetic.
3476
-------------------------------------------------------------------------------
3477
*/
3478
float64 floatx80_to_float64( floatx80 a )
3479
{
3480
    flag aSign;
3481
    int32 aExp;
3482
    bits64 aSig, zSig;
3483

3484
    aSig = extractFloatx80Frac( a );
3485
    aExp = extractFloatx80Exp( a );
3486
    aSign = extractFloatx80Sign( a );
3487
    if ( aExp == 0x7FFF ) {
3488
        if ( (bits64) ( aSig<<1 ) ) {
3489
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3490
        }
3491
        return packFloat64( aSign, 0x7FF, 0 );
3492
    }
3493
    shift64RightJamming( aSig, 1, &zSig );
3494
    if ( aExp || aSig ) aExp -= 0x3C01;
3495
    return roundAndPackFloat64( aSign, aExp, zSig );
3496

3497
}
3498

3499
#ifdef FLOAT128
3500

3501
/*
3502
-------------------------------------------------------------------------------
3503
Returns the result of converting the extended double-precision floating-
3504
point value `a' to the quadruple-precision floating-point format.  The
3505
conversion is performed according to the IEC/IEEE Standard for Binary
3506
Floating-Point Arithmetic.
3507
-------------------------------------------------------------------------------
3508
*/
3509
float128 floatx80_to_float128( floatx80 a )
3510
{
3511
    flag aSign;
3512
    int16 aExp;
3513
    bits64 aSig, zSig0, zSig1;
3514

3515
    aSig = extractFloatx80Frac( a );
3516
    aExp = extractFloatx80Exp( a );
3517
    aSign = extractFloatx80Sign( a );
3518
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3519
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3520
    }
3521
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3522
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3523

3524
}
3525

3526
#endif
3527

3528
/*
3529
-------------------------------------------------------------------------------
3530
Rounds the extended double-precision floating-point value `a' to an integer,
3531
and returns the result as an extended quadruple-precision floating-point
3532
value.  The operation is performed according to the IEC/IEEE Standard for
3533
Binary Floating-Point Arithmetic.
3534
-------------------------------------------------------------------------------
3535
*/
3536
floatx80 floatx80_round_to_int( floatx80 a )
3537
{
3538
    flag aSign;
3539
    int32 aExp;
3540
    bits64 lastBitMask, roundBitsMask;
3541
    int8 roundingMode;
3542
    floatx80 z;
3543

3544
    aExp = extractFloatx80Exp( a );
3545
    if ( 0x403E <= aExp ) {
3546
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3547
            return propagateFloatx80NaN( a, a );
3548
        }
3549
        return a;
3550
    }
3551
    if ( aExp < 0x3FFF ) {
3552
        if (    ( aExp == 0 )
3553
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3554
            return a;
3555
        }
3556
        float_exception_flags |= float_flag_inexact;
3557
        aSign = extractFloatx80Sign( a );
3558
        switch ( float_rounding_mode ) {
3559
         case float_round_nearest_even:
3560
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3561
               ) {
3562
                return
3563
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3564
            }
3565
            break;
3566
	 case float_round_to_zero:
3567
	    break;
3568
         case float_round_down:
3569
            return
3570
                  aSign ?
3571
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3572
                : packFloatx80( 0, 0, 0 );
3573
         case float_round_up:
3574
            return
3575
                  aSign ? packFloatx80( 1, 0, 0 )
3576
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3577
        }
3578
        return packFloatx80( aSign, 0, 0 );
3579
    }
3580
    lastBitMask = 1;
3581
    lastBitMask <<= 0x403E - aExp;
3582
    roundBitsMask = lastBitMask - 1;
3583
    z = a;
3584
    roundingMode = float_rounding_mode;
3585
    if ( roundingMode == float_round_nearest_even ) {
3586
        z.low += lastBitMask>>1;
3587
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3588
    }
3589
    else if ( roundingMode != float_round_to_zero ) {
3590
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3591
            z.low += roundBitsMask;
3592
        }
3593
    }
3594
    z.low &= ~ roundBitsMask;
3595
    if ( z.low == 0 ) {
3596
        ++z.high;
3597
        z.low = LIT64( 0x8000000000000000 );
3598
    }
3599
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3600
    return z;
3601

3602
}
3603

3604
/*
3605
-------------------------------------------------------------------------------
3606
Returns the result of adding the absolute values of the extended double-
3607
precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3608
negated before being returned.  `zSign' is ignored if the result is a NaN.
3609
The addition is performed according to the IEC/IEEE Standard for Binary
3610
Floating-Point Arithmetic.
3611
-------------------------------------------------------------------------------
3612
*/
3613
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3614
{
3615
    int32 aExp, bExp, zExp;
3616
    bits64 aSig, bSig, zSig0, zSig1;
3617
    int32 expDiff;
3618

3619
    aSig = extractFloatx80Frac( a );
3620
    aExp = extractFloatx80Exp( a );
3621
    bSig = extractFloatx80Frac( b );
3622
    bExp = extractFloatx80Exp( b );
3623
    expDiff = aExp - bExp;
3624
    if ( 0 < expDiff ) {
3625
        if ( aExp == 0x7FFF ) {
3626
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3627
            return a;
3628
        }
3629
        if ( bExp == 0 ) --expDiff;
3630
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3631
        zExp = aExp;
3632
    }
3633
    else if ( expDiff < 0 ) {
3634
        if ( bExp == 0x7FFF ) {
3635
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3636
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637
        }
3638
        if ( aExp == 0 ) ++expDiff;
3639
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3640
        zExp = bExp;
3641
    }
3642
    else {
3643
        if ( aExp == 0x7FFF ) {
3644
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3645
                return propagateFloatx80NaN( a, b );
3646
            }
3647
            return a;
3648
        }
3649
        zSig1 = 0;
3650
        zSig0 = aSig + bSig;
3651
        if ( aExp == 0 ) {
3652
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3653
            goto roundAndPack;
3654
        }
3655
        zExp = aExp;
3656
        goto shiftRight1;
3657
    }
3658
    zSig0 = aSig + bSig;
3659
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3660
 shiftRight1:
3661
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3662
    zSig0 |= LIT64( 0x8000000000000000 );
3663
    ++zExp;
3664
 roundAndPack:
3665
    return
3666
        roundAndPackFloatx80(
3667
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668

3669
}
3670

3671
/*
3672
-------------------------------------------------------------------------------
3673
Returns the result of subtracting the absolute values of the extended
3674
double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3675
difference is negated before being returned.  `zSign' is ignored if the
3676
result is a NaN.  The subtraction is performed according to the IEC/IEEE
3677
Standard for Binary Floating-Point Arithmetic.
3678
-------------------------------------------------------------------------------
3679
*/
3680
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3681
{
3682
    int32 aExp, bExp, zExp;
3683
    bits64 aSig, bSig, zSig0, zSig1;
3684
    int32 expDiff;
3685
    floatx80 z;
3686

3687
    aSig = extractFloatx80Frac( a );
3688
    aExp = extractFloatx80Exp( a );
3689
    bSig = extractFloatx80Frac( b );
3690
    bExp = extractFloatx80Exp( b );
3691
    expDiff = aExp - bExp;
3692
    if ( 0 < expDiff ) goto aExpBigger;
3693
    if ( expDiff < 0 ) goto bExpBigger;
3694
    if ( aExp == 0x7FFF ) {
3695
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3696
            return propagateFloatx80NaN( a, b );
3697
        }
3698
        float_raise( float_flag_invalid );
3699
        z.low = floatx80_default_nan_low;
3700
        z.high = floatx80_default_nan_high;
3701
        return z;
3702
    }
3703
    if ( aExp == 0 ) {
3704
        aExp = 1;
3705
        bExp = 1;
3706
    }
3707
    zSig1 = 0;
3708
    if ( bSig < aSig ) goto aBigger;
3709
    if ( aSig < bSig ) goto bBigger;
3710
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3711
 bExpBigger:
3712
    if ( bExp == 0x7FFF ) {
3713
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3714
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3715
    }
3716
    if ( aExp == 0 ) ++expDiff;
3717
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3718
 bBigger:
3719
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3720
    zExp = bExp;
3721
    zSign ^= 1;
3722
    goto normalizeRoundAndPack;
3723
 aExpBigger:
3724
    if ( aExp == 0x7FFF ) {
3725
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3726
        return a;
3727
    }
3728
    if ( bExp == 0 ) --expDiff;
3729
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3730
 aBigger:
3731
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3732
    zExp = aExp;
3733
 normalizeRoundAndPack:
3734
    return
3735
        normalizeRoundAndPackFloatx80(
3736
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3737

3738
}
3739

3740
/*
3741
-------------------------------------------------------------------------------
3742
Returns the result of adding the extended double-precision floating-point
3743
values `a' and `b'.  The operation is performed according to the IEC/IEEE
3744
Standard for Binary Floating-Point Arithmetic.
3745
-------------------------------------------------------------------------------
3746
*/
3747
floatx80 floatx80_add( floatx80 a, floatx80 b )
3748
{
3749
    flag aSign, bSign;
3750

3751
    aSign = extractFloatx80Sign( a );
3752
    bSign = extractFloatx80Sign( b );
3753
    if ( aSign == bSign ) {
3754
        return addFloatx80Sigs( a, b, aSign );
3755
    }
3756
    else {
3757
        return subFloatx80Sigs( a, b, aSign );
3758
    }
3759

3760
}
3761

3762
/*
3763
-------------------------------------------------------------------------------
3764
Returns the result of subtracting the extended double-precision floating-
3765
point values `a' and `b'.  The operation is performed according to the
3766
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767
-------------------------------------------------------------------------------
3768
*/
3769
floatx80 floatx80_sub( floatx80 a, floatx80 b )
3770
{
3771
    flag aSign, bSign;
3772

3773
    aSign = extractFloatx80Sign( a );
3774
    bSign = extractFloatx80Sign( b );
3775
    if ( aSign == bSign ) {
3776
        return subFloatx80Sigs( a, b, aSign );
3777
    }
3778
    else {
3779
        return addFloatx80Sigs( a, b, aSign );
3780
    }
3781

3782
}
3783

3784
/*
3785
-------------------------------------------------------------------------------
3786
Returns the result of multiplying the extended double-precision floating-
3787
point values `a' and `b'.  The operation is performed according to the
3788
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789
-------------------------------------------------------------------------------
3790
*/
3791
floatx80 floatx80_mul( floatx80 a, floatx80 b )
3792
{
3793
    flag aSign, bSign, zSign;
3794
    int32 aExp, bExp, zExp;
3795
    bits64 aSig, bSig, zSig0, zSig1;
3796
    floatx80 z;
3797

3798
    aSig = extractFloatx80Frac( a );
3799
    aExp = extractFloatx80Exp( a );
3800
    aSign = extractFloatx80Sign( a );
3801
    bSig = extractFloatx80Frac( b );
3802
    bExp = extractFloatx80Exp( b );
3803
    bSign = extractFloatx80Sign( b );
3804
    zSign = aSign ^ bSign;
3805
    if ( aExp == 0x7FFF ) {
3806
        if (    (bits64) ( aSig<<1 )
3807
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3808
            return propagateFloatx80NaN( a, b );
3809
        }
3810
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3811
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3812
    }
3813
    if ( bExp == 0x7FFF ) {
3814
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3815
        if ( ( aExp | aSig ) == 0 ) {
3816
 invalid:
3817
            float_raise( float_flag_invalid );
3818
            z.low = floatx80_default_nan_low;
3819
            z.high = floatx80_default_nan_high;
3820
            return z;
3821
        }
3822
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3823
    }
3824
    if ( aExp == 0 ) {
3825
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3826
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3827
    }
3828
    if ( bExp == 0 ) {
3829
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3830
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3831
    }
3832
    zExp = aExp + bExp - 0x3FFE;
3833
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3834
    if ( 0 < (sbits64) zSig0 ) {
3835
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3836
        --zExp;
3837
    }
3838
    return
3839
        roundAndPackFloatx80(
3840
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3841

3842
}
3843

3844
/*
3845
-------------------------------------------------------------------------------
3846
Returns the result of dividing the extended double-precision floating-point
3847
value `a' by the corresponding value `b'.  The operation is performed
3848
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849
-------------------------------------------------------------------------------
3850
*/
3851
floatx80 floatx80_div( floatx80 a, floatx80 b )
3852
{
3853
    flag aSign, bSign, zSign;
3854
    int32 aExp, bExp, zExp;
3855
    bits64 aSig, bSig, zSig0, zSig1;
3856
    bits64 rem0, rem1, rem2, term0, term1, term2;
3857
    floatx80 z;
3858

3859
    aSig = extractFloatx80Frac( a );
3860
    aExp = extractFloatx80Exp( a );
3861
    aSign = extractFloatx80Sign( a );
3862
    bSig = extractFloatx80Frac( b );
3863
    bExp = extractFloatx80Exp( b );
3864
    bSign = extractFloatx80Sign( b );
3865
    zSign = aSign ^ bSign;
3866
    if ( aExp == 0x7FFF ) {
3867
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3868
        if ( bExp == 0x7FFF ) {
3869
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3870
            goto invalid;
3871
        }
3872
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3873
    }
3874
    if ( bExp == 0x7FFF ) {
3875
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3876
        return packFloatx80( zSign, 0, 0 );
3877
    }
3878
    if ( bExp == 0 ) {
3879
        if ( bSig == 0 ) {
3880
            if ( ( aExp | aSig ) == 0 ) {
3881
 invalid:
3882
                float_raise( float_flag_invalid );
3883
                z.low = floatx80_default_nan_low;
3884
                z.high = floatx80_default_nan_high;
3885
                return z;
3886
            }
3887
            float_raise( float_flag_divbyzero );
3888
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3889
        }
3890
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3891
    }
3892
    if ( aExp == 0 ) {
3893
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3894
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3895
    }
3896
    zExp = aExp - bExp + 0x3FFE;
3897
    rem1 = 0;
3898
    if ( bSig <= aSig ) {
3899
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3900
        ++zExp;
3901
    }
3902
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3903
    mul64To128( bSig, zSig0, &term0, &term1 );
3904
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3905
    while ( (sbits64) rem0 < 0 ) {
3906
        --zSig0;
3907
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3908
    }
3909
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3910
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3911
        mul64To128( bSig, zSig1, &term1, &term2 );
3912
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3913
        while ( (sbits64) rem1 < 0 ) {
3914
            --zSig1;
3915
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3916
        }
3917
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3918
    }
3919
    return
3920
        roundAndPackFloatx80(
3921
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3922

3923
}
3924

3925
/*
3926
-------------------------------------------------------------------------------
3927
Returns the remainder of the extended double-precision floating-point value
3928
`a' with respect to the corresponding value `b'.  The operation is performed
3929
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930
-------------------------------------------------------------------------------
3931
*/
3932
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3933
{
3934
    flag aSign, bSign, zSign;
3935
    int32 aExp, bExp, expDiff;
3936
    bits64 aSig0, aSig1, bSig;
3937
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3938
    floatx80 z;
3939

3940
    aSig0 = extractFloatx80Frac( a );
3941
    aExp = extractFloatx80Exp( a );
3942
    aSign = extractFloatx80Sign( a );
3943
    bSig = extractFloatx80Frac( b );
3944
    bExp = extractFloatx80Exp( b );
3945
    bSign = extractFloatx80Sign( b );
3946
    if ( aExp == 0x7FFF ) {
3947
        if (    (bits64) ( aSig0<<1 )
3948
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3949
            return propagateFloatx80NaN( a, b );
3950
        }
3951
        goto invalid;
3952
    }
3953
    if ( bExp == 0x7FFF ) {
3954
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3955
        return a;
3956
    }
3957
    if ( bExp == 0 ) {
3958
        if ( bSig == 0 ) {
3959
 invalid:
3960
            float_raise( float_flag_invalid );
3961
            z.low = floatx80_default_nan_low;
3962
            z.high = floatx80_default_nan_high;
3963
            return z;
3964
        }
3965
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3966
    }
3967
    if ( aExp == 0 ) {
3968
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3969
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3970
    }
3971
    bSig |= LIT64( 0x8000000000000000 );
3972
    zSign = aSign;
3973
    expDiff = aExp - bExp;
3974
    aSig1 = 0;
3975
    if ( expDiff < 0 ) {
3976
        if ( expDiff < -1 ) return a;
3977
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3978
        expDiff = 0;
3979
    }
3980
    q = ( bSig <= aSig0 );
3981
    if ( q ) aSig0 -= bSig;
3982
    expDiff -= 64;
3983
    while ( 0 < expDiff ) {
3984
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3985
        q = ( 2 < q ) ? q - 2 : 0;
3986
        mul64To128( bSig, q, &term0, &term1 );
3987
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3988
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3989
        expDiff -= 62;
3990
    }
3991
    expDiff += 64;
3992
    if ( 0 < expDiff ) {
3993
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3994
        q = ( 2 < q ) ? q - 2 : 0;
3995
        q >>= 64 - expDiff;
3996
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3997
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3998
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3999
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4000
            ++q;
4001
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4002
        }
4003
    }
4004
    else {
4005
        term1 = 0;
4006
        term0 = bSig;
4007
    }
4008
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4009
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011
              && ( q & 1 ) )
4012
       ) {
4013
        aSig0 = alternateASig0;
4014
        aSig1 = alternateASig1;
4015
        zSign = ! zSign;
4016
    }
4017
    return
4018
        normalizeRoundAndPackFloatx80(
4019
            80, zSign, bExp + expDiff, aSig0, aSig1 );
4020

4021
}
4022

4023
/*
4024
-------------------------------------------------------------------------------
4025
Returns the square root of the extended double-precision floating-point
4026
value `a'.  The operation is performed according to the IEC/IEEE Standard
4027
for Binary Floating-Point Arithmetic.
4028
-------------------------------------------------------------------------------
4029
*/
4030
floatx80 floatx80_sqrt( floatx80 a )
4031
{
4032
    flag aSign;
4033
    int32 aExp, zExp;
4034
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4035
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4036
    floatx80 z;
4037

4038
    aSig0 = extractFloatx80Frac( a );
4039
    aExp = extractFloatx80Exp( a );
4040
    aSign = extractFloatx80Sign( a );
4041
    if ( aExp == 0x7FFF ) {
4042
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4043
        if ( ! aSign ) return a;
4044
        goto invalid;
4045
    }
4046
    if ( aSign ) {
4047
        if ( ( aExp | aSig0 ) == 0 ) return a;
4048
 invalid:
4049
        float_raise( float_flag_invalid );
4050
        z.low = floatx80_default_nan_low;
4051
        z.high = floatx80_default_nan_high;
4052
        return z;
4053
    }
4054
    if ( aExp == 0 ) {
4055
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4056
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4057
    }
4058
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4059
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4060
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4061
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4062
    doubleZSig0 = zSig0<<1;
4063
    mul64To128( zSig0, zSig0, &term0, &term1 );
4064
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4065
    while ( (sbits64) rem0 < 0 ) {
4066
        --zSig0;
4067
        doubleZSig0 -= 2;
4068
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4069
    }
4070
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4071
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4072
        if ( zSig1 == 0 ) zSig1 = 1;
4073
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4074
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4075
        mul64To128( zSig1, zSig1, &term2, &term3 );
4076
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4077
        while ( (sbits64) rem1 < 0 ) {
4078
            --zSig1;
4079
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4080
            term3 |= 1;
4081
            term2 |= doubleZSig0;
4082
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4083
        }
4084
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4085
    }
4086
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4087
    zSig0 |= doubleZSig0;
4088
    return
4089
        roundAndPackFloatx80(
4090
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4091

4092
}
4093

4094
/*
4095
-------------------------------------------------------------------------------
4096
Returns 1 if the extended double-precision floating-point value `a' is
4097
equal to the corresponding value `b', and 0 otherwise.  The comparison is
4098
performed according to the IEC/IEEE Standard for Binary Floating-Point
4099
Arithmetic.
4100
-------------------------------------------------------------------------------
4101
*/
4102
flag floatx80_eq( floatx80 a, floatx80 b )
4103
{
4104

4105
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4106
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4107
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4108
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4109
       ) {
4110
        if (    floatx80_is_signaling_nan( a )
4111
             || floatx80_is_signaling_nan( b ) ) {
4112
            float_raise( float_flag_invalid );
4113
        }
4114
        return 0;
4115
    }
4116
    return
4117
           ( a.low == b.low )
4118
        && (    ( a.high == b.high )
4119
             || (    ( a.low == 0 )
4120
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4121
           );
4122

4123
}
4124

4125
/*
4126
-------------------------------------------------------------------------------
4127
Returns 1 if the extended double-precision floating-point value `a' is
4128
less than or equal to the corresponding value `b', and 0 otherwise.  The
4129
comparison is performed according to the IEC/IEEE Standard for Binary
4130
Floating-Point Arithmetic.
4131
-------------------------------------------------------------------------------
4132
*/
4133
flag floatx80_le( floatx80 a, floatx80 b )
4134
{
4135
    flag aSign, bSign;
4136

4137
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4138
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4139
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4140
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4141
       ) {
4142
        float_raise( float_flag_invalid );
4143
        return 0;
4144
    }
4145
    aSign = extractFloatx80Sign( a );
4146
    bSign = extractFloatx80Sign( b );
4147
    if ( aSign != bSign ) {
4148
        return
4149
               aSign
4150
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4151
                 == 0 );
4152
    }
4153
    return
4154
          aSign ? le128( b.high, b.low, a.high, a.low )
4155
        : le128( a.high, a.low, b.high, b.low );
4156

4157
}
4158

4159
/*
4160
-------------------------------------------------------------------------------
4161
Returns 1 if the extended double-precision floating-point value `a' is
4162
less than the corresponding value `b', and 0 otherwise.  The comparison
4163
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4164
Arithmetic.
4165
-------------------------------------------------------------------------------
4166
*/
4167
flag floatx80_lt( floatx80 a, floatx80 b )
4168
{
4169
    flag aSign, bSign;
4170

4171
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4172
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4173
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4174
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4175
       ) {
4176
        float_raise( float_flag_invalid );
4177
        return 0;
4178
    }
4179
    aSign = extractFloatx80Sign( a );
4180
    bSign = extractFloatx80Sign( b );
4181
    if ( aSign != bSign ) {
4182
        return
4183
               aSign
4184
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4185
                 != 0 );
4186
    }
4187
    return
4188
          aSign ? lt128( b.high, b.low, a.high, a.low )
4189
        : lt128( a.high, a.low, b.high, b.low );
4190

4191
}
4192

4193
/*
4194
-------------------------------------------------------------------------------
4195
Returns 1 if the extended double-precision floating-point value `a' is equal
4196
to the corresponding value `b', and 0 otherwise.  The invalid exception is
4197
raised if either operand is a NaN.  Otherwise, the comparison is performed
4198
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199
-------------------------------------------------------------------------------
4200
*/
4201
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4202
{
4203

4204
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4205
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4207
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208
       ) {
4209
        float_raise( float_flag_invalid );
4210
        return 0;
4211
    }
4212
    return
4213
           ( a.low == b.low )
4214
        && (    ( a.high == b.high )
4215
             || (    ( a.low == 0 )
4216
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217
           );
4218

4219
}
4220

4221
/*
4222
-------------------------------------------------------------------------------
4223
Returns 1 if the extended double-precision floating-point value `a' is less
4224
than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4225
do not cause an exception.  Otherwise, the comparison is performed according
4226
to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227
-------------------------------------------------------------------------------
4228
*/
4229
flag floatx80_le_quiet( floatx80 a, floatx80 b )
4230
{
4231
    flag aSign, bSign;
4232

4233
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4234
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4236
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237
       ) {
4238
        if (    floatx80_is_signaling_nan( a )
4239
             || floatx80_is_signaling_nan( b ) ) {
4240
            float_raise( float_flag_invalid );
4241
        }
4242
        return 0;
4243
    }
4244
    aSign = extractFloatx80Sign( a );
4245
    bSign = extractFloatx80Sign( b );
4246
    if ( aSign != bSign ) {
4247
        return
4248
               aSign
4249
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4250
                 == 0 );
4251
    }
4252
    return
4253
          aSign ? le128( b.high, b.low, a.high, a.low )
4254
        : le128( a.high, a.low, b.high, b.low );
4255

4256
}
4257

4258
/*
4259
-------------------------------------------------------------------------------
4260
Returns 1 if the extended double-precision floating-point value `a' is less
4261
than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4262
an exception.  Otherwise, the comparison is performed according to the
4263
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264
-------------------------------------------------------------------------------
4265
*/
4266
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4267
{
4268
    flag aSign, bSign;
4269

4270
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4271
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4272
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4273
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4274
       ) {
4275
        if (    floatx80_is_signaling_nan( a )
4276
             || floatx80_is_signaling_nan( b ) ) {
4277
            float_raise( float_flag_invalid );
4278
        }
4279
        return 0;
4280
    }
4281
    aSign = extractFloatx80Sign( a );
4282
    bSign = extractFloatx80Sign( b );
4283
    if ( aSign != bSign ) {
4284
        return
4285
               aSign
4286
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4287
                 != 0 );
4288
    }
4289
    return
4290
          aSign ? lt128( b.high, b.low, a.high, a.low )
4291
        : lt128( a.high, a.low, b.high, b.low );
4292

4293
}
4294

4295
#endif
4296

4297
#ifdef FLOAT128
4298

4299
/*
4300
-------------------------------------------------------------------------------
4301
Returns the result of converting the quadruple-precision floating-point
4302
value `a' to the 32-bit two's complement integer format.  The conversion
4303
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304
Arithmetic---which means in particular that the conversion is rounded
4305
according to the current rounding mode.  If `a' is a NaN, the largest
4306
positive integer is returned.  Otherwise, if the conversion overflows, the
4307
largest integer with the same sign as `a' is returned.
4308
-------------------------------------------------------------------------------
4309
*/
4310
int32 float128_to_int32( float128 a )
4311
{
4312
    flag aSign;
4313
    int32 aExp, shiftCount;
4314
    bits64 aSig0, aSig1;
4315

4316
    aSig1 = extractFloat128Frac1( a );
4317
    aSig0 = extractFloat128Frac0( a );
4318
    aExp = extractFloat128Exp( a );
4319
    aSign = extractFloat128Sign( a );
4320
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4321
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4322
    aSig0 |= ( aSig1 != 0 );
4323
    shiftCount = 0x4028 - aExp;
4324
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4325
    return roundAndPackInt32( aSign, aSig0 );
4326

4327
}
4328

4329
/*
4330
-------------------------------------------------------------------------------
4331
Returns the result of converting the quadruple-precision floating-point
4332
value `a' to the 32-bit two's complement integer format.  The conversion
4333
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4334
Arithmetic, except that the conversion is always rounded toward zero.  If
4335
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4336
conversion overflows, the largest integer with the same sign as `a' is
4337
returned.
4338
-------------------------------------------------------------------------------
4339
*/
4340
int32 float128_to_int32_round_to_zero( float128 a )
4341
{
4342
    flag aSign;
4343
    int32 aExp, shiftCount;
4344
    bits64 aSig0, aSig1, savedASig;
4345
    int32 z;
4346

4347
    aSig1 = extractFloat128Frac1( a );
4348
    aSig0 = extractFloat128Frac0( a );
4349
    aExp = extractFloat128Exp( a );
4350
    aSign = extractFloat128Sign( a );
4351
    aSig0 |= ( aSig1 != 0 );
4352
    if ( 0x401E < aExp ) {
4353
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4354
        goto invalid;
4355
    }
4356
    else if ( aExp < 0x3FFF ) {
4357
        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4358
        return 0;
4359
    }
4360
    aSig0 |= LIT64( 0x0001000000000000 );
4361
    shiftCount = 0x402F - aExp;
4362
    savedASig = aSig0;
4363
    aSig0 >>= shiftCount;
4364
    z = aSig0;
4365
    if ( aSign ) z = - z;
4366
    if ( ( z < 0 ) ^ aSign ) {
4367
 invalid:
4368
        float_raise( float_flag_invalid );
4369
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4370
    }
4371
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4372
        float_exception_flags |= float_flag_inexact;
4373
    }
4374
    return z;
4375

4376
}
4377

4378
/*
4379
-------------------------------------------------------------------------------
4380
Returns the result of converting the quadruple-precision floating-point
4381
value `a' to the 64-bit two's complement integer format.  The conversion
4382
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383
Arithmetic---which means in particular that the conversion is rounded
4384
according to the current rounding mode.  If `a' is a NaN, the largest
4385
positive integer is returned.  Otherwise, if the conversion overflows, the
4386
largest integer with the same sign as `a' is returned.
4387
-------------------------------------------------------------------------------
4388
*/
4389
int64 float128_to_int64( float128 a )
4390
{
4391
    flag aSign;
4392
    int32 aExp, shiftCount;
4393
    bits64 aSig0, aSig1;
4394

4395
    aSig1 = extractFloat128Frac1( a );
4396
    aSig0 = extractFloat128Frac0( a );
4397
    aExp = extractFloat128Exp( a );
4398
    aSign = extractFloat128Sign( a );
4399
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4400
    shiftCount = 0x402F - aExp;
4401
    if ( shiftCount <= 0 ) {
4402
        if ( 0x403E < aExp ) {
4403
            float_raise( float_flag_invalid );
4404
            if (    ! aSign
4405
                 || (    ( aExp == 0x7FFF )
4406
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4407
                    )
4408
               ) {
4409
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4410
            }
4411
            return (sbits64) LIT64( 0x8000000000000000 );
4412
        }
4413
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4414
    }
4415
    else {
4416
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4417
    }
4418
    return roundAndPackInt64( aSign, aSig0, aSig1 );
4419

4420
}
4421

4422
/*
4423
-------------------------------------------------------------------------------
4424
Returns the result of converting the quadruple-precision floating-point
4425
value `a' to the 64-bit two's complement integer format.  The conversion
4426
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427
Arithmetic, except that the conversion is always rounded toward zero.
4428
If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4429
the conversion overflows, the largest integer with the same sign as `a' is
4430
returned.
4431
-------------------------------------------------------------------------------
4432
*/
4433
int64 float128_to_int64_round_to_zero( float128 a )
4434
{
4435
    flag aSign;
4436
    int32 aExp, shiftCount;
4437
    bits64 aSig0, aSig1;
4438
    int64 z;
4439

4440
    aSig1 = extractFloat128Frac1( a );
4441
    aSig0 = extractFloat128Frac0( a );
4442
    aExp = extractFloat128Exp( a );
4443
    aSign = extractFloat128Sign( a );
4444
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4445
    shiftCount = aExp - 0x402F;
4446
    if ( 0 < shiftCount ) {
4447
        if ( 0x403E <= aExp ) {
4448
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4449
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4450
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4451
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4452
            }
4453
            else {
4454
                float_raise( float_flag_invalid );
4455
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4456
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4457
                }
4458
            }
4459
            return (sbits64) LIT64( 0x8000000000000000 );
4460
        }
4461
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4462
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4463
            float_exception_flags |= float_flag_inexact;
4464
        }
4465
    }
4466
    else {
4467
        if ( aExp < 0x3FFF ) {
4468
            if ( aExp | aSig0 | aSig1 ) {
4469
                float_exception_flags |= float_flag_inexact;
4470
            }
4471
            return 0;
4472
        }
4473
        z = aSig0>>( - shiftCount );
4474
        if (    aSig1
4475
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4476
            float_exception_flags |= float_flag_inexact;
4477
        }
4478
    }
4479
    if ( aSign ) z = - z;
4480
    return z;
4481

4482
}
4483

4484
#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4485
    && defined(SOFTFLOAT_NEED_FIXUNS)
4486
/*
4487
 * just like above - but do not care for overflow of signed results
4488
 */
4489
uint64 float128_to_uint64_round_to_zero( float128 a )
4490
{
4491
    flag aSign;
4492
    int32 aExp, shiftCount;
4493
    bits64 aSig0, aSig1;
4494
    uint64 z;
4495

4496
    aSig1 = extractFloat128Frac1( a );
4497
    aSig0 = extractFloat128Frac0( a );
4498
    aExp = extractFloat128Exp( a );
4499
    aSign = extractFloat128Sign( a );
4500
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4501
    shiftCount = aExp - 0x402F;
4502
    if ( 0 < shiftCount ) {
4503
        if ( 0x403F <= aExp ) {
4504
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4505
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4506
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4507
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4508
            }
4509
            else {
4510
                float_raise( float_flag_invalid );
4511
            }
4512
            return LIT64( 0xFFFFFFFFFFFFFFFF );
4513
        }
4514
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4515
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4516
            float_exception_flags |= float_flag_inexact;
4517
        }
4518
    }
4519
    else {
4520
        if ( aExp < 0x3FFF ) {
4521
            if ( aExp | aSig0 | aSig1 ) {
4522
                float_exception_flags |= float_flag_inexact;
4523
            }
4524
            return 0;
4525
        }
4526
        z = aSig0>>( - shiftCount );
4527
        if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4528
            float_exception_flags |= float_flag_inexact;
4529
        }
4530
    }
4531
    if ( aSign ) z = - z;
4532
    return z;
4533

4534
}
4535
#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4536

4537
/*
4538
-------------------------------------------------------------------------------
4539
Returns the result of converting the quadruple-precision floating-point
4540
value `a' to the single-precision floating-point format.  The conversion
4541
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4542
Arithmetic.
4543
-------------------------------------------------------------------------------
4544
*/
4545
float32 float128_to_float32( float128 a )
4546
{
4547
    flag aSign;
4548
    int32 aExp;
4549
    bits64 aSig0, aSig1;
4550
    bits32 zSig;
4551

4552
    aSig1 = extractFloat128Frac1( a );
4553
    aSig0 = extractFloat128Frac0( a );
4554
    aExp = extractFloat128Exp( a );
4555
    aSign = extractFloat128Sign( a );
4556
    if ( aExp == 0x7FFF ) {
4557
        if ( aSig0 | aSig1 ) {
4558
            return commonNaNToFloat32( float128ToCommonNaN( a ) );
4559
        }
4560
        return packFloat32( aSign, 0xFF, 0 );
4561
    }
4562
    aSig0 |= ( aSig1 != 0 );
4563
    shift64RightJamming( aSig0, 18, &aSig0 );
4564
    zSig = aSig0;
4565
    if ( aExp || zSig ) {
4566
        zSig |= 0x40000000;
4567
        aExp -= 0x3F81;
4568
    }
4569
    return roundAndPackFloat32( aSign, aExp, zSig );
4570

4571
}
4572

4573
/*
4574
-------------------------------------------------------------------------------
4575
Returns the result of converting the quadruple-precision floating-point
4576
value `a' to the double-precision floating-point format.  The conversion
4577
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578
Arithmetic.
4579
-------------------------------------------------------------------------------
4580
*/
4581
float64 float128_to_float64( float128 a )
4582
{
4583
    flag aSign;
4584
    int32 aExp;
4585
    bits64 aSig0, aSig1;
4586

4587
    aSig1 = extractFloat128Frac1( a );
4588
    aSig0 = extractFloat128Frac0( a );
4589
    aExp = extractFloat128Exp( a );
4590
    aSign = extractFloat128Sign( a );
4591
    if ( aExp == 0x7FFF ) {
4592
        if ( aSig0 | aSig1 ) {
4593
            return commonNaNToFloat64( float128ToCommonNaN( a ) );
4594
        }
4595
        return packFloat64( aSign, 0x7FF, 0 );
4596
    }
4597
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4598
    aSig0 |= ( aSig1 != 0 );
4599
    if ( aExp || aSig0 ) {
4600
        aSig0 |= LIT64( 0x4000000000000000 );
4601
        aExp -= 0x3C01;
4602
    }
4603
    return roundAndPackFloat64( aSign, aExp, aSig0 );
4604

4605
}
4606

4607
#ifdef FLOATX80
4608

4609
/*
4610
-------------------------------------------------------------------------------
4611
Returns the result of converting the quadruple-precision floating-point
4612
value `a' to the extended double-precision floating-point format.  The
4613
conversion is performed according to the IEC/IEEE Standard for Binary
4614
Floating-Point Arithmetic.
4615
-------------------------------------------------------------------------------
4616
*/
4617
floatx80 float128_to_floatx80( float128 a )
4618
{
4619
    flag aSign;
4620
    int32 aExp;
4621
    bits64 aSig0, aSig1;
4622

4623
    aSig1 = extractFloat128Frac1( a );
4624
    aSig0 = extractFloat128Frac0( a );
4625
    aExp = extractFloat128Exp( a );
4626
    aSign = extractFloat128Sign( a );
4627
    if ( aExp == 0x7FFF ) {
4628
        if ( aSig0 | aSig1 ) {
4629
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4630
        }
4631
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4632
    }
4633
    if ( aExp == 0 ) {
4634
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4635
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636
    }
4637
    else {
4638
        aSig0 |= LIT64( 0x0001000000000000 );
4639
    }
4640
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4641
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4642

4643
}
4644

4645
#endif
4646

4647
/*
4648
-------------------------------------------------------------------------------
4649
Rounds the quadruple-precision floating-point value `a' to an integer, and
4650
returns the result as a quadruple-precision floating-point value.  The
4651
operation is performed according to the IEC/IEEE Standard for Binary
4652
Floating-Point Arithmetic.
4653
-------------------------------------------------------------------------------
4654
*/
4655
float128 float128_round_to_int( float128 a )
4656
{
4657
    flag aSign;
4658
    int32 aExp;
4659
    bits64 lastBitMask, roundBitsMask;
4660
    int8 roundingMode;
4661
    float128 z;
4662

4663
    aExp = extractFloat128Exp( a );
4664
    if ( 0x402F <= aExp ) {
4665
        if ( 0x406F <= aExp ) {
4666
            if (    ( aExp == 0x7FFF )
4667
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4668
               ) {
4669
                return propagateFloat128NaN( a, a );
4670
            }
4671
            return a;
4672
        }
4673
        lastBitMask = 1;
4674
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4675
        roundBitsMask = lastBitMask - 1;
4676
        z = a;
4677
        roundingMode = float_rounding_mode;
4678
        if ( roundingMode == float_round_nearest_even ) {
4679
            if ( lastBitMask ) {
4680
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4681
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4682
            }
4683
            else {
4684
                if ( (sbits64) z.low < 0 ) {
4685
                    ++z.high;
4686
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4687
                }
4688
            }
4689
        }
4690
        else if ( roundingMode != float_round_to_zero ) {
4691
            if (   extractFloat128Sign( z )
4692
                 ^ ( roundingMode == float_round_up ) ) {
4693
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4694
            }
4695
        }
4696
        z.low &= ~ roundBitsMask;
4697
    }
4698
    else {
4699
        if ( aExp < 0x3FFF ) {
4700
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4701
            float_exception_flags |= float_flag_inexact;
4702
            aSign = extractFloat128Sign( a );
4703
            switch ( float_rounding_mode ) {
4704
             case float_round_nearest_even:
4705
                if (    ( aExp == 0x3FFE )
4706
                     && (   extractFloat128Frac0( a )
4707
                          | extractFloat128Frac1( a ) )
4708
                   ) {
4709
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4710
                }
4711
                break;
4712
	     case float_round_to_zero:
4713
		break;
4714
             case float_round_down:
4715
                return
4716
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4717
                    : packFloat128( 0, 0, 0, 0 );
4718
             case float_round_up:
4719
                return
4720
                      aSign ? packFloat128( 1, 0, 0, 0 )
4721
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4722
            }
4723
            return packFloat128( aSign, 0, 0, 0 );
4724
        }
4725
        lastBitMask = 1;
4726
        lastBitMask <<= 0x402F - aExp;
4727
        roundBitsMask = lastBitMask - 1;
4728
        z.low = 0;
4729
        z.high = a.high;
4730
        roundingMode = float_rounding_mode;
4731
        if ( roundingMode == float_round_nearest_even ) {
4732
            z.high += lastBitMask>>1;
4733
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4734
                z.high &= ~ lastBitMask;
4735
            }
4736
        }
4737
        else if ( roundingMode != float_round_to_zero ) {
4738
            if (   extractFloat128Sign( z )
4739
                 ^ ( roundingMode == float_round_up ) ) {
4740
                z.high |= ( a.low != 0 );
4741
                z.high += roundBitsMask;
4742
            }
4743
        }
4744
        z.high &= ~ roundBitsMask;
4745
    }
4746
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4747
        float_exception_flags |= float_flag_inexact;
4748
    }
4749
    return z;
4750

4751
}
4752

4753
/*
4754
-------------------------------------------------------------------------------
4755
Returns the result of adding the absolute values of the quadruple-precision
4756
floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4757
before being returned.  `zSign' is ignored if the result is a NaN.
4758
The addition is performed according to the IEC/IEEE Standard for Binary
4759
Floating-Point Arithmetic.
4760
-------------------------------------------------------------------------------
4761
*/
4762
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4763
{
4764
    int32 aExp, bExp, zExp;
4765
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4766
    int32 expDiff;
4767

4768
    aSig1 = extractFloat128Frac1( a );
4769
    aSig0 = extractFloat128Frac0( a );
4770
    aExp = extractFloat128Exp( a );
4771
    bSig1 = extractFloat128Frac1( b );
4772
    bSig0 = extractFloat128Frac0( b );
4773
    bExp = extractFloat128Exp( b );
4774
    expDiff = aExp - bExp;
4775
    if ( 0 < expDiff ) {
4776
        if ( aExp == 0x7FFF ) {
4777
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4778
            return a;
4779
        }
4780
        if ( bExp == 0 ) {
4781
            --expDiff;
4782
        }
4783
        else {
4784
            bSig0 |= LIT64( 0x0001000000000000 );
4785
        }
4786
        shift128ExtraRightJamming(
4787
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4788
        zExp = aExp;
4789
    }
4790
    else if ( expDiff < 0 ) {
4791
        if ( bExp == 0x7FFF ) {
4792
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4793
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4794
        }
4795
        if ( aExp == 0 ) {
4796
            ++expDiff;
4797
        }
4798
        else {
4799
            aSig0 |= LIT64( 0x0001000000000000 );
4800
        }
4801
        shift128ExtraRightJamming(
4802
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4803
        zExp = bExp;
4804
    }
4805
    else {
4806
        if ( aExp == 0x7FFF ) {
4807
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4808
                return propagateFloat128NaN( a, b );
4809
            }
4810
            return a;
4811
        }
4812
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4813
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4814
        zSig2 = 0;
4815
        zSig0 |= LIT64( 0x0002000000000000 );
4816
        zExp = aExp;
4817
        goto shiftRight1;
4818
    }
4819
    aSig0 |= LIT64( 0x0001000000000000 );
4820
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4821
    --zExp;
4822
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4823
    ++zExp;
4824
 shiftRight1:
4825
    shift128ExtraRightJamming(
4826
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4827
 roundAndPack:
4828
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4829

4830
}
4831

4832
/*
4833
-------------------------------------------------------------------------------
4834
Returns the result of subtracting the absolute values of the quadruple-
4835
precision floating-point values `a' and `b'.  If `zSign' is 1, the
4836
difference is negated before being returned.  `zSign' is ignored if the
4837
result is a NaN.  The subtraction is performed according to the IEC/IEEE
4838
Standard for Binary Floating-Point Arithmetic.
4839
-------------------------------------------------------------------------------
4840
*/
4841
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4842
{
4843
    int32 aExp, bExp, zExp;
4844
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4845
    int32 expDiff;
4846
    float128 z;
4847

4848
    aSig1 = extractFloat128Frac1( a );
4849
    aSig0 = extractFloat128Frac0( a );
4850
    aExp = extractFloat128Exp( a );
4851
    bSig1 = extractFloat128Frac1( b );
4852
    bSig0 = extractFloat128Frac0( b );
4853
    bExp = extractFloat128Exp( b );
4854
    expDiff = aExp - bExp;
4855
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4856
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4857
    if ( 0 < expDiff ) goto aExpBigger;
4858
    if ( expDiff < 0 ) goto bExpBigger;
4859
    if ( aExp == 0x7FFF ) {
4860
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4861
            return propagateFloat128NaN( a, b );
4862
        }
4863
        float_raise( float_flag_invalid );
4864
        z.low = float128_default_nan_low;
4865
        z.high = float128_default_nan_high;
4866
        return z;
4867
    }
4868
    if ( aExp == 0 ) {
4869
        aExp = 1;
4870
        bExp = 1;
4871
    }
4872
    if ( bSig0 < aSig0 ) goto aBigger;
4873
    if ( aSig0 < bSig0 ) goto bBigger;
4874
    if ( bSig1 < aSig1 ) goto aBigger;
4875
    if ( aSig1 < bSig1 ) goto bBigger;
4876
    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4877
 bExpBigger:
4878
    if ( bExp == 0x7FFF ) {
4879
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4881
    }
4882
    if ( aExp == 0 ) {
4883
        ++expDiff;
4884
    }
4885
    else {
4886
        aSig0 |= LIT64( 0x4000000000000000 );
4887
    }
4888
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4889
    bSig0 |= LIT64( 0x4000000000000000 );
4890
 bBigger:
4891
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4892
    zExp = bExp;
4893
    zSign ^= 1;
4894
    goto normalizeRoundAndPack;
4895
 aExpBigger:
4896
    if ( aExp == 0x7FFF ) {
4897
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4898
        return a;
4899
    }
4900
    if ( bExp == 0 ) {
4901
        --expDiff;
4902
    }
4903
    else {
4904
        bSig0 |= LIT64( 0x4000000000000000 );
4905
    }
4906
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4907
    aSig0 |= LIT64( 0x4000000000000000 );
4908
 aBigger:
4909
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4910
    zExp = aExp;
4911
 normalizeRoundAndPack:
4912
    --zExp;
4913
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4914

4915
}
4916

4917
/*
4918
-------------------------------------------------------------------------------
4919
Returns the result of adding the quadruple-precision floating-point values
4920
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4921
for Binary Floating-Point Arithmetic.
4922
-------------------------------------------------------------------------------
4923
*/
4924
float128 float128_add( float128 a, float128 b )
4925
{
4926
    flag aSign, bSign;
4927

4928
    aSign = extractFloat128Sign( a );
4929
    bSign = extractFloat128Sign( b );
4930
    if ( aSign == bSign ) {
4931
        return addFloat128Sigs( a, b, aSign );
4932
    }
4933
    else {
4934
        return subFloat128Sigs( a, b, aSign );
4935
    }
4936

4937
}
4938

4939
/*
4940
-------------------------------------------------------------------------------
4941
Returns the result of subtracting the quadruple-precision floating-point
4942
values `a' and `b'.  The operation is performed according to the IEC/IEEE
4943
Standard for Binary Floating-Point Arithmetic.
4944
-------------------------------------------------------------------------------
4945
*/
4946
float128 float128_sub( float128 a, float128 b )
4947
{
4948
    flag aSign, bSign;
4949

4950
    aSign = extractFloat128Sign( a );
4951
    bSign = extractFloat128Sign( b );
4952
    if ( aSign == bSign ) {
4953
        return subFloat128Sigs( a, b, aSign );
4954
    }
4955
    else {
4956
        return addFloat128Sigs( a, b, aSign );
4957
    }
4958

4959
}
4960

4961
/*
4962
-------------------------------------------------------------------------------
4963
Returns the result of multiplying the quadruple-precision floating-point
4964
values `a' and `b'.  The operation is performed according to the IEC/IEEE
4965
Standard for Binary Floating-Point Arithmetic.
4966
-------------------------------------------------------------------------------
4967
*/
4968
float128 float128_mul( float128 a, float128 b )
4969
{
4970
    flag aSign, bSign, zSign;
4971
    int32 aExp, bExp, zExp;
4972
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4973
    float128 z;
4974

4975
    aSig1 = extractFloat128Frac1( a );
4976
    aSig0 = extractFloat128Frac0( a );
4977
    aExp = extractFloat128Exp( a );
4978
    aSign = extractFloat128Sign( a );
4979
    bSig1 = extractFloat128Frac1( b );
4980
    bSig0 = extractFloat128Frac0( b );
4981
    bExp = extractFloat128Exp( b );
4982
    bSign = extractFloat128Sign( b );
4983
    zSign = aSign ^ bSign;
4984
    if ( aExp == 0x7FFF ) {
4985
        if (    ( aSig0 | aSig1 )
4986
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987
            return propagateFloat128NaN( a, b );
4988
        }
4989
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4990
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4991
    }
4992
    if ( bExp == 0x7FFF ) {
4993
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4994
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4995
 invalid:
4996
            float_raise( float_flag_invalid );
4997
            z.low = float128_default_nan_low;
4998
            z.high = float128_default_nan_high;
4999
            return z;
5000
        }
5001
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5002
    }
5003
    if ( aExp == 0 ) {
5004
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5005
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5006
    }
5007
    if ( bExp == 0 ) {
5008
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5009
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5010
    }
5011
    zExp = aExp + bExp - 0x4000;
5012
    aSig0 |= LIT64( 0x0001000000000000 );
5013
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5014
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5015
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5016
    zSig2 |= ( zSig3 != 0 );
5017
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5018
        shift128ExtraRightJamming(
5019
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5020
        ++zExp;
5021
    }
5022
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5023

5024
}
5025

5026
/*
5027
-------------------------------------------------------------------------------
5028
Returns the result of dividing the quadruple-precision floating-point value
5029
`a' by the corresponding value `b'.  The operation is performed according to
5030
the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031
-------------------------------------------------------------------------------
5032
*/
5033
float128 float128_div( float128 a, float128 b )
5034
{
5035
    flag aSign, bSign, zSign;
5036
    int32 aExp, bExp, zExp;
5037
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5038
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5039
    float128 z;
5040

5041
    aSig1 = extractFloat128Frac1( a );
5042
    aSig0 = extractFloat128Frac0( a );
5043
    aExp = extractFloat128Exp( a );
5044
    aSign = extractFloat128Sign( a );
5045
    bSig1 = extractFloat128Frac1( b );
5046
    bSig0 = extractFloat128Frac0( b );
5047
    bExp = extractFloat128Exp( b );
5048
    bSign = extractFloat128Sign( b );
5049
    zSign = aSign ^ bSign;
5050
    if ( aExp == 0x7FFF ) {
5051
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5052
        if ( bExp == 0x7FFF ) {
5053
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5054
            goto invalid;
5055
        }
5056
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5057
    }
5058
    if ( bExp == 0x7FFF ) {
5059
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5060
        return packFloat128( zSign, 0, 0, 0 );
5061
    }
5062
    if ( bExp == 0 ) {
5063
        if ( ( bSig0 | bSig1 ) == 0 ) {
5064
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5065
 invalid:
5066
                float_raise( float_flag_invalid );
5067
                z.low = float128_default_nan_low;
5068
                z.high = float128_default_nan_high;
5069
                return z;
5070
            }
5071
            float_raise( float_flag_divbyzero );
5072
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5073
        }
5074
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5075
    }
5076
    if ( aExp == 0 ) {
5077
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5078
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079
    }
5080
    zExp = aExp - bExp + 0x3FFD;
5081
    shortShift128Left(
5082
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5083
    shortShift128Left(
5084
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5085
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5086
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5087
        ++zExp;
5088
    }
5089
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5090
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5091
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5092
    while ( (sbits64) rem0 < 0 ) {
5093
        --zSig0;
5094
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5095
    }
5096
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5097
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5098
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5099
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5100
        while ( (sbits64) rem1 < 0 ) {
5101
            --zSig1;
5102
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5103
        }
5104
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5105
    }
5106
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5107
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5108

5109
}
5110

5111
/*
5112
-------------------------------------------------------------------------------
5113
Returns the remainder of the quadruple-precision floating-point value `a'
5114
with respect to the corresponding value `b'.  The operation is performed
5115
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116
-------------------------------------------------------------------------------
5117
*/
5118
float128 float128_rem( float128 a, float128 b )
5119
{
5120
    flag aSign, bSign, zSign;
5121
    int32 aExp, bExp, expDiff;
5122
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5123
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5124
    sbits64 sigMean0;
5125
    float128 z;
5126

5127
    aSig1 = extractFloat128Frac1( a );
5128
    aSig0 = extractFloat128Frac0( a );
5129
    aExp = extractFloat128Exp( a );
5130
    aSign = extractFloat128Sign( a );
5131
    bSig1 = extractFloat128Frac1( b );
5132
    bSig0 = extractFloat128Frac0( b );
5133
    bExp = extractFloat128Exp( b );
5134
    bSign = extractFloat128Sign( b );
5135
    if ( aExp == 0x7FFF ) {
5136
        if (    ( aSig0 | aSig1 )
5137
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138
            return propagateFloat128NaN( a, b );
5139
        }
5140
        goto invalid;
5141
    }
5142
    if ( bExp == 0x7FFF ) {
5143
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144
        return a;
5145
    }
5146
    if ( bExp == 0 ) {
5147
        if ( ( bSig0 | bSig1 ) == 0 ) {
5148
 invalid:
5149
            float_raise( float_flag_invalid );
5150
            z.low = float128_default_nan_low;
5151
            z.high = float128_default_nan_high;
5152
            return z;
5153
        }
5154
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155
    }
5156
    if ( aExp == 0 ) {
5157
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159
    }
5160
    expDiff = aExp - bExp;
5161
    if ( expDiff < -1 ) return a;
5162
    shortShift128Left(
5163
        aSig0 | LIT64( 0x0001000000000000 ),
5164
        aSig1,
5165
        15 - ( expDiff < 0 ),
5166
        &aSig0,
5167
        &aSig1
5168
    );
5169
    shortShift128Left(
5170
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5172
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173
    expDiff -= 64;
5174
    while ( 0 < expDiff ) {
5175
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176
        q = ( 4 < q ) ? q - 4 : 0;
5177
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181
        expDiff -= 61;
5182
    }
5183
    if ( -64 < expDiff ) {
5184
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185
        q = ( 4 < q ) ? q - 4 : 0;
5186
        q >>= - expDiff;
5187
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188
        expDiff += 52;
5189
        if ( expDiff < 0 ) {
5190
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191
        }
5192
        else {
5193
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194
        }
5195
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197
    }
5198
    else {
5199
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201
    }
5202
    do {
5203
        alternateASig0 = aSig0;
5204
        alternateASig1 = aSig1;
5205
        ++q;
5206
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207
    } while ( 0 <= (sbits64) aSig0 );
5208
    add128(
5209
        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210
    if (    ( sigMean0 < 0 )
5211
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212
        aSig0 = alternateASig0;
5213
        aSig1 = alternateASig1;
5214
    }
5215
    zSign = ( (sbits64) aSig0 < 0 );
5216
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217
    return
5218
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219

5220
}
5221

5222
/*
5223
-------------------------------------------------------------------------------
5224
Returns the square root of the quadruple-precision floating-point value `a'.
5225
The operation is performed according to the IEC/IEEE Standard for Binary
5226
Floating-Point Arithmetic.
5227
-------------------------------------------------------------------------------
5228
*/
5229
float128 float128_sqrt( float128 a )
5230
{
5231
    flag aSign;
5232
    int32 aExp, zExp;
5233
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235
    float128 z;
5236

5237
    aSig1 = extractFloat128Frac1( a );
5238
    aSig0 = extractFloat128Frac0( a );
5239
    aExp = extractFloat128Exp( a );
5240
    aSign = extractFloat128Sign( a );
5241
    if ( aExp == 0x7FFF ) {
5242
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243
        if ( ! aSign ) return a;
5244
        goto invalid;
5245
    }
5246
    if ( aSign ) {
5247
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248
 invalid:
5249
        float_raise( float_flag_invalid );
5250
        z.low = float128_default_nan_low;
5251
        z.high = float128_default_nan_high;
5252
        return z;
5253
    }
5254
    if ( aExp == 0 ) {
5255
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257
    }
5258
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5259
    aSig0 |= LIT64( 0x0001000000000000 );
5260
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5261
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263
    doubleZSig0 = zSig0<<1;
5264
    mul64To128( zSig0, zSig0, &term0, &term1 );
5265
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266
    while ( (sbits64) rem0 < 0 ) {
5267
        --zSig0;
5268
        doubleZSig0 -= 2;
5269
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270
    }
5271
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273
        if ( zSig1 == 0 ) zSig1 = 1;
5274
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276
        mul64To128( zSig1, zSig1, &term2, &term3 );
5277
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278
        while ( (sbits64) rem1 < 0 ) {
5279
            --zSig1;
5280
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281
            term3 |= 1;
5282
            term2 |= doubleZSig0;
5283
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284
        }
5285
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286
    }
5287
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289

5290
}
5291

5292
/*
5293
-------------------------------------------------------------------------------
5294
Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295
the corresponding value `b', and 0 otherwise.  The comparison is performed
5296
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297
-------------------------------------------------------------------------------
5298
*/
5299
flag float128_eq( float128 a, float128 b )
5300
{
5301

5302
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5303
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5305
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306
       ) {
5307
        if (    float128_is_signaling_nan( a )
5308
             || float128_is_signaling_nan( b ) ) {
5309
            float_raise( float_flag_invalid );
5310
        }
5311
        return 0;
5312
    }
5313
    return
5314
           ( a.low == b.low )
5315
        && (    ( a.high == b.high )
5316
             || (    ( a.low == 0 )
5317
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318
           );
5319

5320
}
5321

5322
/*
5323
-------------------------------------------------------------------------------
5324
Returns 1 if the quadruple-precision floating-point value `a' is less than
5325
or equal to the corresponding value `b', and 0 otherwise.  The comparison
5326
is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327
Arithmetic.
5328
-------------------------------------------------------------------------------
5329
*/
5330
flag float128_le( float128 a, float128 b )
5331
{
5332
    flag aSign, bSign;
5333

5334
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5335
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5337
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338
       ) {
5339
        float_raise( float_flag_invalid );
5340
        return 0;
5341
    }
5342
    aSign = extractFloat128Sign( a );
5343
    bSign = extractFloat128Sign( b );
5344
    if ( aSign != bSign ) {
5345
        return
5346
               aSign
5347
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348
                 == 0 );
5349
    }
5350
    return
5351
          aSign ? le128( b.high, b.low, a.high, a.low )
5352
        : le128( a.high, a.low, b.high, b.low );
5353

5354
}
5355

5356
/*
5357
-------------------------------------------------------------------------------
5358
Returns 1 if the quadruple-precision floating-point value `a' is less than
5359
the corresponding value `b', and 0 otherwise.  The comparison is performed
5360
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361
-------------------------------------------------------------------------------
5362
*/
5363
flag float128_lt( float128 a, float128 b )
5364
{
5365
    flag aSign, bSign;
5366

5367
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5368
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5370
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371
       ) {
5372
        float_raise( float_flag_invalid );
5373
        return 0;
5374
    }
5375
    aSign = extractFloat128Sign( a );
5376
    bSign = extractFloat128Sign( b );
5377
    if ( aSign != bSign ) {
5378
        return
5379
               aSign
5380
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381
                 != 0 );
5382
    }
5383
    return
5384
          aSign ? lt128( b.high, b.low, a.high, a.low )
5385
        : lt128( a.high, a.low, b.high, b.low );
5386

5387
}
5388

5389
/*
5390
-------------------------------------------------------------------------------
5391
Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392
the corresponding value `b', and 0 otherwise.  The invalid exception is
5393
raised if either operand is a NaN.  Otherwise, the comparison is performed
5394
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395
-------------------------------------------------------------------------------
5396
*/
5397
flag float128_eq_signaling( float128 a, float128 b )
5398
{
5399

5400
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5401
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5403
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404
       ) {
5405
        float_raise( float_flag_invalid );
5406
        return 0;
5407
    }
5408
    return
5409
           ( a.low == b.low )
5410
        && (    ( a.high == b.high )
5411
             || (    ( a.low == 0 )
5412
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413
           );
5414

5415
}
5416

5417
/*
5418
-------------------------------------------------------------------------------
5419
Returns 1 if the quadruple-precision floating-point value `a' is less than
5420
or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5421
cause an exception.  Otherwise, the comparison is performed according to the
5422
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423
-------------------------------------------------------------------------------
5424
*/
5425
flag float128_le_quiet( float128 a, float128 b )
5426
{
5427
    flag aSign, bSign;
5428

5429
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5430
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5432
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433
       ) {
5434
        if (    float128_is_signaling_nan( a )
5435
             || float128_is_signaling_nan( b ) ) {
5436
            float_raise( float_flag_invalid );
5437
        }
5438
        return 0;
5439
    }
5440
    aSign = extractFloat128Sign( a );
5441
    bSign = extractFloat128Sign( b );
5442
    if ( aSign != bSign ) {
5443
        return
5444
               aSign
5445
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446
                 == 0 );
5447
    }
5448
    return
5449
          aSign ? le128( b.high, b.low, a.high, a.low )
5450
        : le128( a.high, a.low, b.high, b.low );
5451

5452
}
5453

5454
/*
5455
-------------------------------------------------------------------------------
5456
Returns 1 if the quadruple-precision floating-point value `a' is less than
5457
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5458
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5459
Standard for Binary Floating-Point Arithmetic.
5460
-------------------------------------------------------------------------------
5461
*/
5462
flag float128_lt_quiet( float128 a, float128 b )
5463
{
5464
    flag aSign, bSign;
5465

5466
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5467
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5469
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470
       ) {
5471
        if (    float128_is_signaling_nan( a )
5472
             || float128_is_signaling_nan( b ) ) {
5473
            float_raise( float_flag_invalid );
5474
        }
5475
        return 0;
5476
    }
5477
    aSign = extractFloat128Sign( a );
5478
    bSign = extractFloat128Sign( b );
5479
    if ( aSign != bSign ) {
5480
        return
5481
               aSign
5482
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483
                 != 0 );
5484
    }
5485
    return
5486
          aSign ? lt128( b.high, b.low, a.high, a.low )
5487
        : lt128( a.high, a.low, b.high, b.low );
5488

5489
}
5490

5491
#endif
5492

5493

5494
#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495

5496
/*
5497
 * These two routines are not part of the original softfloat distribution.
5498
 *
5499
 * They are based on the corresponding conversions to integer but return
5500
 * unsigned numbers instead since these functions are required by GCC.
5501
 *
5502
 * Added by Mark Brinicombe <[email protected]>	27/09/97
5503
 *
5504
 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505
 */
5506

5507
/*
5508
-------------------------------------------------------------------------------
5509
Returns the result of converting the double-precision floating-point value
5510
`a' to the 32-bit unsigned integer format.  The conversion is
5511
performed according to the IEC/IEEE Standard for Binary Floating-point
5512
Arithmetic, except that the conversion is always rounded toward zero.  If
5513
`a' is a NaN, the largest positive integer is returned.  If the conversion
5514
overflows, the largest integer positive is returned.
5515
-------------------------------------------------------------------------------
5516
*/
5517
uint32 float64_to_uint32_round_to_zero( float64 a )
5518
{
5519
    flag aSign;
5520
    int16 aExp, shiftCount;
5521
    bits64 aSig, savedASig;
5522
    uint32 z;
5523

5524
    aSig = extractFloat64Frac( a );
5525
    aExp = extractFloat64Exp( a );
5526
    aSign = extractFloat64Sign( a );
5527

5528
    if (aSign) {
5529
        float_raise( float_flag_invalid );
5530
    	return(0);
5531
    }
5532

5533
    if ( 0x41E < aExp ) {
5534
        float_raise( float_flag_invalid );
5535
        return 0xffffffff;
5536
    }
5537
    else if ( aExp < 0x3FF ) {
5538
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539
        return 0;
5540
    }
5541
    aSig |= LIT64( 0x0010000000000000 );
5542
    shiftCount = 0x433 - aExp;
5543
    savedASig = aSig;
5544
    aSig >>= shiftCount;
5545
    z = aSig;
5546
    if ( ( aSig<<shiftCount ) != savedASig ) {
5547
        float_exception_flags |= float_flag_inexact;
5548
    }
5549
    return z;
5550

5551
}
5552

5553
/*
5554
-------------------------------------------------------------------------------
5555
Returns the result of converting the single-precision floating-point value
5556
`a' to the 32-bit unsigned integer format.  The conversion is
5557
performed according to the IEC/IEEE Standard for Binary Floating-point
5558
Arithmetic, except that the conversion is always rounded toward zero.  If
5559
`a' is a NaN, the largest positive integer is returned.  If the conversion
5560
overflows, the largest positive integer is returned.
5561
-------------------------------------------------------------------------------
5562
*/
5563
uint32 float32_to_uint32_round_to_zero( float32 a )
5564
{
5565
    flag aSign;
5566
    int16 aExp, shiftCount;
5567
    bits32 aSig;
5568
    uint32 z;
5569

5570
    aSig = extractFloat32Frac( a );
5571
    aExp = extractFloat32Exp( a );
5572
    aSign = extractFloat32Sign( a );
5573
    shiftCount = aExp - 0x9E;
5574

5575
    if (aSign) {
5576
        float_raise( float_flag_invalid );
5577
    	return(0);
5578
    }
5579
    if ( 0 < shiftCount ) {
5580
        float_raise( float_flag_invalid );
5581
        return 0xFFFFFFFF;
5582
    }
5583
    else if ( aExp <= 0x7E ) {
5584
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585
        return 0;
5586
    }
5587
    aSig = ( aSig | 0x800000 )<<8;
5588
    z = aSig>>( - shiftCount );
5589
    if ( aSig<<( shiftCount & 31 ) ) {
5590
        float_exception_flags |= float_flag_inexact;
5591
    }
5592
    return z;
5593

5594
}
5595

5596
#endif
5597

5598
Product

Resources

Company