Path: blob/main/lib/libc/softfloat/bits64/softfloat.c
39530 views
/* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */12/*3* This version hacked for use with gcc -msoft-float by bjh21.4* (Mostly a case of #ifdefing out things GCC doesn't need or provides5* itself).6*/78/*9* Things you may want to define:10*11* SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with12* -msoft-float) to work. Include "softfloat-for-gcc.h" to get them13* properly renamed.14*/1516/*17===============================================================================1819This C source file is part of the SoftFloat IEC/IEEE Floating-point20Arithmetic Package, Release 2a.2122Written by John R. Hauser. This work was made possible in part by the23International Computer Science Institute, located at Suite 600, 1947 Center24Street, Berkeley, California 94704. Funding was partially provided by the25National Science Foundation under grant MIP-9311980. The original version26of this code was written as part of a project to build a fixed-point vector27processor in collaboration with the University of California at Berkeley,28overseen by Profs. Nelson Morgan and John Wawrzynek. More information29is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/30arithmetic/SoftFloat.html'.3132THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort33has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT34TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO35PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY36AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.3738Derivative works are acceptable, even for commercial purposes, so long as39(1) they include prominent notice that the work is derivative, and (2) they40include prominent notice akin to these four paragraphs for those parts of41this code that are retained.4243===============================================================================44*/4546#ifdef SOFTFLOAT_FOR_GCC47#include "softfloat-for-gcc.h"48#endif4950#include "milieu.h"51#include "softfloat.h"5253/*54* Conversions between floats as stored in memory and floats as55* SoftFloat uses them56*/57#ifndef FLOAT64_DEMANGLE58#define FLOAT64_DEMANGLE(a) (a)59#endif60#ifndef FLOAT64_MANGLE61#define FLOAT64_MANGLE(a) (a)62#endif6364/*65-------------------------------------------------------------------------------66Floating-point rounding mode, extended double-precision rounding precision,67and exception flags.68-------------------------------------------------------------------------------69*/70int float_rounding_mode = float_round_nearest_even;71int float_exception_flags = 0;72#ifdef FLOATX8073int8 floatx80_rounding_precision = 80;74#endif7576/*77-------------------------------------------------------------------------------78Primitive arithmetic functions, including multi-word arithmetic, and79division and square root approximations. (Can be specialized to target if80desired.)81-------------------------------------------------------------------------------82*/83#include "softfloat-macros"8485/*86-------------------------------------------------------------------------------87Functions and definitions to determine: (1) whether tininess for underflow88is detected before or after rounding by default, (2) what (if anything)89happens when exceptions are raised, (3) how signaling NaNs are distinguished90from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs91are propagated from function inputs to output. These details are target-92specific.93-------------------------------------------------------------------------------94*/95#include "softfloat-specialize"9697#if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)98/*99-------------------------------------------------------------------------------100Takes a 64-bit fixed-point value `absZ' with binary point between bits 6101and 7, and returns the properly rounded 32-bit integer corresponding to the102input. If `zSign' is 1, the input is negated before being converted to an103integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input104is simply rounded to an integer, with the inexact exception raised if the105input cannot be represented exactly as an integer. However, if the fixed-106point input is too large, the invalid exception is raised and the largest107positive or negative integer is returned.108-------------------------------------------------------------------------------109*/110static int32 roundAndPackInt32( flag zSign, bits64 absZ )111{112int8 roundingMode;113flag roundNearestEven;114int8 roundIncrement, roundBits;115int32 z;116117roundingMode = float_rounding_mode;118roundNearestEven = ( roundingMode == float_round_nearest_even );119roundIncrement = 0x40;120if ( ! roundNearestEven ) {121if ( roundingMode == float_round_to_zero ) {122roundIncrement = 0;123}124else {125roundIncrement = 0x7F;126if ( zSign ) {127if ( roundingMode == float_round_up ) roundIncrement = 0;128}129else {130if ( roundingMode == float_round_down ) roundIncrement = 0;131}132}133}134roundBits = absZ & 0x7F;135absZ = ( absZ + roundIncrement )>>7;136absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );137z = absZ;138if ( zSign ) z = - z;139if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {140float_raise( float_flag_invalid );141return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;142}143if ( roundBits ) float_exception_flags |= float_flag_inexact;144return z;145146}147148/*149-------------------------------------------------------------------------------150Takes the 128-bit fixed-point value formed by concatenating `absZ0' and151`absZ1', with binary point between bits 63 and 64 (between the input words),152and returns the properly rounded 64-bit integer corresponding to the input.153If `zSign' is 1, the input is negated before being converted to an integer.154Ordinarily, the fixed-point input is simply rounded to an integer, with155the inexact exception raised if the input cannot be represented exactly as156an integer. However, if the fixed-point input is too large, the invalid157exception is raised and the largest positive or negative integer is158returned.159-------------------------------------------------------------------------------160*/161static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )162{163int8 roundingMode;164flag roundNearestEven, increment;165int64 z;166167roundingMode = float_rounding_mode;168roundNearestEven = ( roundingMode == float_round_nearest_even );169increment = ( (sbits64) absZ1 < 0 );170if ( ! roundNearestEven ) {171if ( roundingMode == float_round_to_zero ) {172increment = 0;173}174else {175if ( zSign ) {176increment = ( roundingMode == float_round_down ) && absZ1;177}178else {179increment = ( roundingMode == float_round_up ) && absZ1;180}181}182}183if ( increment ) {184++absZ0;185if ( absZ0 == 0 ) goto overflow;186absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );187}188z = absZ0;189if ( zSign ) z = - z;190if ( z && ( ( z < 0 ) ^ zSign ) ) {191overflow:192float_raise( float_flag_invalid );193return194zSign ? (sbits64) LIT64( 0x8000000000000000 )195: LIT64( 0x7FFFFFFFFFFFFFFF );196}197if ( absZ1 ) float_exception_flags |= float_flag_inexact;198return z;199200}201#endif202203/*204-------------------------------------------------------------------------------205Returns the fraction bits of the single-precision floating-point value `a'.206-------------------------------------------------------------------------------207*/208INLINE bits32 extractFloat32Frac( float32 a )209{210211return a & 0x007FFFFF;212213}214215/*216-------------------------------------------------------------------------------217Returns the exponent bits of the single-precision floating-point value `a'.218-------------------------------------------------------------------------------219*/220INLINE int16 extractFloat32Exp( float32 a )221{222223return ( a>>23 ) & 0xFF;224225}226227/*228-------------------------------------------------------------------------------229Returns the sign bit of the single-precision floating-point value `a'.230-------------------------------------------------------------------------------231*/232INLINE flag extractFloat32Sign( float32 a )233{234235return a>>31;236237}238239/*240-------------------------------------------------------------------------------241Normalizes the subnormal single-precision floating-point value represented242by the denormalized significand `aSig'. The normalized exponent and243significand are stored at the locations pointed to by `zExpPtr' and244`zSigPtr', respectively.245-------------------------------------------------------------------------------246*/247static void248normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )249{250int8 shiftCount;251252shiftCount = countLeadingZeros32( aSig ) - 8;253*zSigPtr = aSig<<shiftCount;254*zExpPtr = 1 - shiftCount;255256}257258/*259-------------------------------------------------------------------------------260Packs the sign `zSign', exponent `zExp', and significand `zSig' into a261single-precision floating-point value, returning the result. After being262shifted into the proper positions, the three fields are simply added263together to form the result. This means that any integer portion of `zSig'264will be added into the exponent. Since a properly normalized significand265will have an integer portion equal to 1, the `zExp' input should be 1 less266than the desired result exponent whenever `zSig' is a complete, normalized267significand.268-------------------------------------------------------------------------------269*/270INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )271{272273return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;274275}276277/*278-------------------------------------------------------------------------------279Takes an abstract floating-point value having sign `zSign', exponent `zExp',280and significand `zSig', and returns the proper single-precision floating-281point value corresponding to the abstract input. Ordinarily, the abstract282value is simply rounded and packed into the single-precision format, with283the inexact exception raised if the abstract input cannot be represented284exactly. However, if the abstract value is too large, the overflow and285inexact exceptions are raised and an infinity or maximal finite value is286returned. If the abstract value is too small, the input value is rounded to287a subnormal number, and the underflow and inexact exceptions are raised if288the abstract input cannot be represented exactly as a subnormal single-289precision floating-point number.290The input significand `zSig' has its binary point between bits 30291and 29, which is 7 bits to the left of the usual location. This shifted292significand must be normalized or smaller. If `zSig' is not normalized,293`zExp' must be 0; in that case, the result returned is a subnormal number,294and it must not require rounding. In the usual case that `zSig' is295normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.296The handling of underflow and overflow follows the IEC/IEEE Standard for297Binary Floating-Point Arithmetic.298-------------------------------------------------------------------------------299*/300static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )301{302int8 roundingMode;303flag roundNearestEven;304int8 roundIncrement, roundBits;305flag isTiny;306307roundingMode = float_rounding_mode;308roundNearestEven = ( roundingMode == float_round_nearest_even );309roundIncrement = 0x40;310if ( ! roundNearestEven ) {311if ( roundingMode == float_round_to_zero ) {312roundIncrement = 0;313}314else {315roundIncrement = 0x7F;316if ( zSign ) {317if ( roundingMode == float_round_up ) roundIncrement = 0;318}319else {320if ( roundingMode == float_round_down ) roundIncrement = 0;321}322}323}324roundBits = zSig & 0x7F;325if ( 0xFD <= (bits16) zExp ) {326if ( ( 0xFD < zExp )327|| ( ( zExp == 0xFD )328&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )329) {330float_raise( float_flag_overflow | float_flag_inexact );331return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );332}333if ( zExp < 0 ) {334isTiny =335( float_detect_tininess == float_tininess_before_rounding )336|| ( zExp < -1 )337|| ( zSig + roundIncrement < 0x80000000 );338shift32RightJamming( zSig, - zExp, &zSig );339zExp = 0;340roundBits = zSig & 0x7F;341if ( isTiny && roundBits ) float_raise( float_flag_underflow );342}343}344if ( roundBits ) float_exception_flags |= float_flag_inexact;345zSig = ( zSig + roundIncrement )>>7;346zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );347if ( zSig == 0 ) zExp = 0;348return packFloat32( zSign, zExp, zSig );349350}351352/*353-------------------------------------------------------------------------------354Takes an abstract floating-point value having sign `zSign', exponent `zExp',355and significand `zSig', and returns the proper single-precision floating-356point value corresponding to the abstract input. This routine is just like357`roundAndPackFloat32' except that `zSig' does not have to be normalized.358Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''359floating-point exponent.360-------------------------------------------------------------------------------361*/362static float32363normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )364{365int8 shiftCount;366367shiftCount = countLeadingZeros32( zSig ) - 1;368return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );369370}371372/*373-------------------------------------------------------------------------------374Returns the fraction bits of the double-precision floating-point value `a'.375-------------------------------------------------------------------------------376*/377INLINE bits64 extractFloat64Frac( float64 a )378{379380return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );381382}383384/*385-------------------------------------------------------------------------------386Returns the exponent bits of the double-precision floating-point value `a'.387-------------------------------------------------------------------------------388*/389INLINE int16 extractFloat64Exp( float64 a )390{391392return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;393394}395396/*397-------------------------------------------------------------------------------398Returns the sign bit of the double-precision floating-point value `a'.399-------------------------------------------------------------------------------400*/401INLINE flag extractFloat64Sign( float64 a )402{403404return FLOAT64_DEMANGLE(a)>>63;405406}407408/*409-------------------------------------------------------------------------------410Normalizes the subnormal double-precision floating-point value represented411by the denormalized significand `aSig'. The normalized exponent and412significand are stored at the locations pointed to by `zExpPtr' and413`zSigPtr', respectively.414-------------------------------------------------------------------------------415*/416static void417normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )418{419int8 shiftCount;420421shiftCount = countLeadingZeros64( aSig ) - 11;422*zSigPtr = aSig<<shiftCount;423*zExpPtr = 1 - shiftCount;424425}426427/*428-------------------------------------------------------------------------------429Packs the sign `zSign', exponent `zExp', and significand `zSig' into a430double-precision floating-point value, returning the result. After being431shifted into the proper positions, the three fields are simply added432together to form the result. This means that any integer portion of `zSig'433will be added into the exponent. Since a properly normalized significand434will have an integer portion equal to 1, the `zExp' input should be 1 less435than the desired result exponent whenever `zSig' is a complete, normalized436significand.437-------------------------------------------------------------------------------438*/439INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )440{441442return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +443( ( (bits64) zExp )<<52 ) + zSig );444445}446447/*448-------------------------------------------------------------------------------449Takes an abstract floating-point value having sign `zSign', exponent `zExp',450and significand `zSig', and returns the proper double-precision floating-451point value corresponding to the abstract input. Ordinarily, the abstract452value is simply rounded and packed into the double-precision format, with453the inexact exception raised if the abstract input cannot be represented454exactly. However, if the abstract value is too large, the overflow and455inexact exceptions are raised and an infinity or maximal finite value is456returned. If the abstract value is too small, the input value is rounded to457a subnormal number, and the underflow and inexact exceptions are raised if458the abstract input cannot be represented exactly as a subnormal double-459precision floating-point number.460The input significand `zSig' has its binary point between bits 62461and 61, which is 10 bits to the left of the usual location. This shifted462significand must be normalized or smaller. If `zSig' is not normalized,463`zExp' must be 0; in that case, the result returned is a subnormal number,464and it must not require rounding. In the usual case that `zSig' is465normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.466The handling of underflow and overflow follows the IEC/IEEE Standard for467Binary Floating-Point Arithmetic.468-------------------------------------------------------------------------------469*/470static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )471{472int8 roundingMode;473flag roundNearestEven;474int16 roundIncrement, roundBits;475flag isTiny;476477roundingMode = float_rounding_mode;478roundNearestEven = ( roundingMode == float_round_nearest_even );479roundIncrement = 0x200;480if ( ! roundNearestEven ) {481if ( roundingMode == float_round_to_zero ) {482roundIncrement = 0;483}484else {485roundIncrement = 0x3FF;486if ( zSign ) {487if ( roundingMode == float_round_up ) roundIncrement = 0;488}489else {490if ( roundingMode == float_round_down ) roundIncrement = 0;491}492}493}494roundBits = zSig & 0x3FF;495if ( 0x7FD <= (bits16) zExp ) {496if ( ( 0x7FD < zExp )497|| ( ( zExp == 0x7FD )498&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )499) {500float_raise( float_flag_overflow | float_flag_inexact );501return FLOAT64_MANGLE(502FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -503( roundIncrement == 0 ));504}505if ( zExp < 0 ) {506isTiny =507( float_detect_tininess == float_tininess_before_rounding )508|| ( zExp < -1 )509|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );510shift64RightJamming( zSig, - zExp, &zSig );511zExp = 0;512roundBits = zSig & 0x3FF;513if ( isTiny && roundBits ) float_raise( float_flag_underflow );514}515}516if ( roundBits ) float_exception_flags |= float_flag_inexact;517zSig = ( zSig + roundIncrement )>>10;518zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );519if ( zSig == 0 ) zExp = 0;520return packFloat64( zSign, zExp, zSig );521522}523524/*525-------------------------------------------------------------------------------526Takes an abstract floating-point value having sign `zSign', exponent `zExp',527and significand `zSig', and returns the proper double-precision floating-528point value corresponding to the abstract input. This routine is just like529`roundAndPackFloat64' except that `zSig' does not have to be normalized.530Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''531floating-point exponent.532-------------------------------------------------------------------------------533*/534static float64535normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )536{537int8 shiftCount;538539shiftCount = countLeadingZeros64( zSig ) - 1;540return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );541542}543544#ifdef FLOATX80545546/*547-------------------------------------------------------------------------------548Returns the fraction bits of the extended double-precision floating-point549value `a'.550-------------------------------------------------------------------------------551*/552INLINE bits64 extractFloatx80Frac( floatx80 a )553{554555return a.low;556557}558559/*560-------------------------------------------------------------------------------561Returns the exponent bits of the extended double-precision floating-point562value `a'.563-------------------------------------------------------------------------------564*/565INLINE int32 extractFloatx80Exp( floatx80 a )566{567568return a.high & 0x7FFF;569570}571572/*573-------------------------------------------------------------------------------574Returns the sign bit of the extended double-precision floating-point value575`a'.576-------------------------------------------------------------------------------577*/578INLINE flag extractFloatx80Sign( floatx80 a )579{580581return a.high>>15;582583}584585/*586-------------------------------------------------------------------------------587Normalizes the subnormal extended double-precision floating-point value588represented by the denormalized significand `aSig'. The normalized exponent589and significand are stored at the locations pointed to by `zExpPtr' and590`zSigPtr', respectively.591-------------------------------------------------------------------------------592*/593static void594normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )595{596int8 shiftCount;597598shiftCount = countLeadingZeros64( aSig );599*zSigPtr = aSig<<shiftCount;600*zExpPtr = 1 - shiftCount;601602}603604/*605-------------------------------------------------------------------------------606Packs the sign `zSign', exponent `zExp', and significand `zSig' into an607extended double-precision floating-point value, returning the result.608-------------------------------------------------------------------------------609*/610INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )611{612floatx80 z;613614z.low = zSig;615z.high = ( ( (bits16) zSign )<<15 ) + zExp;616return z;617618}619620/*621-------------------------------------------------------------------------------622Takes an abstract floating-point value having sign `zSign', exponent `zExp',623and extended significand formed by the concatenation of `zSig0' and `zSig1',624and returns the proper extended double-precision floating-point value625corresponding to the abstract input. Ordinarily, the abstract value is626rounded and packed into the extended double-precision format, with the627inexact exception raised if the abstract input cannot be represented628exactly. However, if the abstract value is too large, the overflow and629inexact exceptions are raised and an infinity or maximal finite value is630returned. If the abstract value is too small, the input value is rounded to631a subnormal number, and the underflow and inexact exceptions are raised if632the abstract input cannot be represented exactly as a subnormal extended633double-precision floating-point number.634If `roundingPrecision' is 32 or 64, the result is rounded to the same635number of bits as single or double precision, respectively. Otherwise, the636result is rounded to the full precision of the extended double-precision637format.638The input significand must be normalized or smaller. If the input639significand is not normalized, `zExp' must be 0; in that case, the result640returned is a subnormal number, and it must not require rounding. The641handling of underflow and overflow follows the IEC/IEEE Standard for Binary642Floating-Point Arithmetic.643-------------------------------------------------------------------------------644*/645static floatx80646roundAndPackFloatx80(647int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1648)649{650int8 roundingMode;651flag roundNearestEven, increment, isTiny;652int64 roundIncrement, roundMask, roundBits;653654roundingMode = float_rounding_mode;655roundNearestEven = ( roundingMode == float_round_nearest_even );656if ( roundingPrecision == 80 ) goto precision80;657if ( roundingPrecision == 64 ) {658roundIncrement = LIT64( 0x0000000000000400 );659roundMask = LIT64( 0x00000000000007FF );660}661else if ( roundingPrecision == 32 ) {662roundIncrement = LIT64( 0x0000008000000000 );663roundMask = LIT64( 0x000000FFFFFFFFFF );664}665else {666goto precision80;667}668zSig0 |= ( zSig1 != 0 );669if ( ! roundNearestEven ) {670if ( roundingMode == float_round_to_zero ) {671roundIncrement = 0;672}673else {674roundIncrement = roundMask;675if ( zSign ) {676if ( roundingMode == float_round_up ) roundIncrement = 0;677}678else {679if ( roundingMode == float_round_down ) roundIncrement = 0;680}681}682}683roundBits = zSig0 & roundMask;684if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {685if ( ( 0x7FFE < zExp )686|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )687) {688goto overflow;689}690if ( zExp <= 0 ) {691isTiny =692( float_detect_tininess == float_tininess_before_rounding )693|| ( zExp < 0 )694|| ( zSig0 <= zSig0 + roundIncrement );695shift64RightJamming( zSig0, 1 - zExp, &zSig0 );696zExp = 0;697roundBits = zSig0 & roundMask;698if ( isTiny && roundBits ) float_raise( float_flag_underflow );699if ( roundBits ) float_exception_flags |= float_flag_inexact;700zSig0 += roundIncrement;701if ( (sbits64) zSig0 < 0 ) zExp = 1;702roundIncrement = roundMask + 1;703if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {704roundMask |= roundIncrement;705}706zSig0 &= ~ roundMask;707return packFloatx80( zSign, zExp, zSig0 );708}709}710if ( roundBits ) float_exception_flags |= float_flag_inexact;711zSig0 += roundIncrement;712if ( zSig0 < roundIncrement ) {713++zExp;714zSig0 = LIT64( 0x8000000000000000 );715}716roundIncrement = roundMask + 1;717if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {718roundMask |= roundIncrement;719}720zSig0 &= ~ roundMask;721if ( zSig0 == 0 ) zExp = 0;722return packFloatx80( zSign, zExp, zSig0 );723precision80:724increment = ( (sbits64) zSig1 < 0 );725if ( ! roundNearestEven ) {726if ( roundingMode == float_round_to_zero ) {727increment = 0;728}729else {730if ( zSign ) {731increment = ( roundingMode == float_round_down ) && zSig1;732}733else {734increment = ( roundingMode == float_round_up ) && zSig1;735}736}737}738if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {739if ( ( 0x7FFE < zExp )740|| ( ( zExp == 0x7FFE )741&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )742&& increment743)744) {745roundMask = 0;746overflow:747float_raise( float_flag_overflow | float_flag_inexact );748if ( ( roundingMode == float_round_to_zero )749|| ( zSign && ( roundingMode == float_round_up ) )750|| ( ! zSign && ( roundingMode == float_round_down ) )751) {752return packFloatx80( zSign, 0x7FFE, ~ roundMask );753}754return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );755}756if ( zExp <= 0 ) {757isTiny =758( float_detect_tininess == float_tininess_before_rounding )759|| ( zExp < 0 )760|| ! increment761|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );762shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );763zExp = 0;764if ( isTiny && zSig1 ) float_raise( float_flag_underflow );765if ( zSig1 ) float_exception_flags |= float_flag_inexact;766if ( roundNearestEven ) {767increment = ( (sbits64) zSig1 < 0 );768}769else {770if ( zSign ) {771increment = ( roundingMode == float_round_down ) && zSig1;772}773else {774increment = ( roundingMode == float_round_up ) && zSig1;775}776}777if ( increment ) {778++zSig0;779zSig0 &=780~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );781if ( (sbits64) zSig0 < 0 ) zExp = 1;782}783return packFloatx80( zSign, zExp, zSig0 );784}785}786if ( zSig1 ) float_exception_flags |= float_flag_inexact;787if ( increment ) {788++zSig0;789if ( zSig0 == 0 ) {790++zExp;791zSig0 = LIT64( 0x8000000000000000 );792}793else {794zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );795}796}797else {798if ( zSig0 == 0 ) zExp = 0;799}800return packFloatx80( zSign, zExp, zSig0 );801802}803804/*805-------------------------------------------------------------------------------806Takes an abstract floating-point value having sign `zSign', exponent807`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',808and returns the proper extended double-precision floating-point value809corresponding to the abstract input. This routine is just like810`roundAndPackFloatx80' except that the input significand does not have to be811normalized.812-------------------------------------------------------------------------------813*/814static floatx80815normalizeRoundAndPackFloatx80(816int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1817)818{819int8 shiftCount;820821if ( zSig0 == 0 ) {822zSig0 = zSig1;823zSig1 = 0;824zExp -= 64;825}826shiftCount = countLeadingZeros64( zSig0 );827shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );828zExp -= shiftCount;829return830roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );831832}833834#endif835836#ifdef FLOAT128837838/*839-------------------------------------------------------------------------------840Returns the least-significant 64 fraction bits of the quadruple-precision841floating-point value `a'.842-------------------------------------------------------------------------------843*/844INLINE bits64 extractFloat128Frac1( float128 a )845{846847return a.low;848849}850851/*852-------------------------------------------------------------------------------853Returns the most-significant 48 fraction bits of the quadruple-precision854floating-point value `a'.855-------------------------------------------------------------------------------856*/857INLINE bits64 extractFloat128Frac0( float128 a )858{859860return a.high & LIT64( 0x0000FFFFFFFFFFFF );861862}863864/*865-------------------------------------------------------------------------------866Returns the exponent bits of the quadruple-precision floating-point value867`a'.868-------------------------------------------------------------------------------869*/870INLINE int32 extractFloat128Exp( float128 a )871{872873return ( a.high>>48 ) & 0x7FFF;874875}876877/*878-------------------------------------------------------------------------------879Returns the sign bit of the quadruple-precision floating-point value `a'.880-------------------------------------------------------------------------------881*/882INLINE flag extractFloat128Sign( float128 a )883{884885return a.high>>63;886887}888889/*890-------------------------------------------------------------------------------891Normalizes the subnormal quadruple-precision floating-point value892represented by the denormalized significand formed by the concatenation of893`aSig0' and `aSig1'. The normalized exponent is stored at the location894pointed to by `zExpPtr'. The most significant 49 bits of the normalized895significand are stored at the location pointed to by `zSig0Ptr', and the896least significant 64 bits of the normalized significand are stored at the897location pointed to by `zSig1Ptr'.898-------------------------------------------------------------------------------899*/900static void901normalizeFloat128Subnormal(902bits64 aSig0,903bits64 aSig1,904int32 *zExpPtr,905bits64 *zSig0Ptr,906bits64 *zSig1Ptr907)908{909int8 shiftCount;910911if ( aSig0 == 0 ) {912shiftCount = countLeadingZeros64( aSig1 ) - 15;913if ( shiftCount < 0 ) {914*zSig0Ptr = aSig1>>( - shiftCount );915*zSig1Ptr = aSig1<<( shiftCount & 63 );916}917else {918*zSig0Ptr = aSig1<<shiftCount;919*zSig1Ptr = 0;920}921*zExpPtr = - shiftCount - 63;922}923else {924shiftCount = countLeadingZeros64( aSig0 ) - 15;925shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );926*zExpPtr = 1 - shiftCount;927}928929}930931/*932-------------------------------------------------------------------------------933Packs the sign `zSign', the exponent `zExp', and the significand formed934by the concatenation of `zSig0' and `zSig1' into a quadruple-precision935floating-point value, returning the result. After being shifted into the936proper positions, the three fields `zSign', `zExp', and `zSig0' are simply937added together to form the most significant 32 bits of the result. This938means that any integer portion of `zSig0' will be added into the exponent.939Since a properly normalized significand will have an integer portion equal940to 1, the `zExp' input should be 1 less than the desired result exponent941whenever `zSig0' and `zSig1' concatenated form a complete, normalized942significand.943-------------------------------------------------------------------------------944*/945INLINE float128946packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )947{948float128 z;949950z.low = zSig1;951z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;952return z;953954}955956/*957-------------------------------------------------------------------------------958Takes an abstract floating-point value having sign `zSign', exponent `zExp',959and extended significand formed by the concatenation of `zSig0', `zSig1',960and `zSig2', and returns the proper quadruple-precision floating-point value961corresponding to the abstract input. Ordinarily, the abstract value is962simply rounded and packed into the quadruple-precision format, with the963inexact exception raised if the abstract input cannot be represented964exactly. However, if the abstract value is too large, the overflow and965inexact exceptions are raised and an infinity or maximal finite value is966returned. If the abstract value is too small, the input value is rounded to967a subnormal number, and the underflow and inexact exceptions are raised if968the abstract input cannot be represented exactly as a subnormal quadruple-969precision floating-point number.970The input significand must be normalized or smaller. If the input971significand is not normalized, `zExp' must be 0; in that case, the result972returned is a subnormal number, and it must not require rounding. In the973usual case that the input significand is normalized, `zExp' must be 1 less974than the ``true'' floating-point exponent. The handling of underflow and975overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.976-------------------------------------------------------------------------------977*/978static float128979roundAndPackFloat128(980flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )981{982int8 roundingMode;983flag roundNearestEven, increment, isTiny;984985roundingMode = float_rounding_mode;986roundNearestEven = ( roundingMode == float_round_nearest_even );987increment = ( (sbits64) zSig2 < 0 );988if ( ! roundNearestEven ) {989if ( roundingMode == float_round_to_zero ) {990increment = 0;991}992else {993if ( zSign ) {994increment = ( roundingMode == float_round_down ) && zSig2;995}996else {997increment = ( roundingMode == float_round_up ) && zSig2;998}999}1000}1001if ( 0x7FFD <= (bits32) zExp ) {1002if ( ( 0x7FFD < zExp )1003|| ( ( zExp == 0x7FFD )1004&& eq128(1005LIT64( 0x0001FFFFFFFFFFFF ),1006LIT64( 0xFFFFFFFFFFFFFFFF ),1007zSig0,1008zSig11009)1010&& increment1011)1012) {1013float_raise( float_flag_overflow | float_flag_inexact );1014if ( ( roundingMode == float_round_to_zero )1015|| ( zSign && ( roundingMode == float_round_up ) )1016|| ( ! zSign && ( roundingMode == float_round_down ) )1017) {1018return1019packFloat128(1020zSign,10210x7FFE,1022LIT64( 0x0000FFFFFFFFFFFF ),1023LIT64( 0xFFFFFFFFFFFFFFFF )1024);1025}1026return packFloat128( zSign, 0x7FFF, 0, 0 );1027}1028if ( zExp < 0 ) {1029isTiny =1030( float_detect_tininess == float_tininess_before_rounding )1031|| ( zExp < -1 )1032|| ! increment1033|| lt128(1034zSig0,1035zSig1,1036LIT64( 0x0001FFFFFFFFFFFF ),1037LIT64( 0xFFFFFFFFFFFFFFFF )1038);1039shift128ExtraRightJamming(1040zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );1041zExp = 0;1042if ( isTiny && zSig2 ) float_raise( float_flag_underflow );1043if ( roundNearestEven ) {1044increment = ( (sbits64) zSig2 < 0 );1045}1046else {1047if ( zSign ) {1048increment = ( roundingMode == float_round_down ) && zSig2;1049}1050else {1051increment = ( roundingMode == float_round_up ) && zSig2;1052}1053}1054}1055}1056if ( zSig2 ) float_exception_flags |= float_flag_inexact;1057if ( increment ) {1058add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );1059zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );1060}1061else {1062if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;1063}1064return packFloat128( zSign, zExp, zSig0, zSig1 );10651066}10671068/*1069-------------------------------------------------------------------------------1070Takes an abstract floating-point value having sign `zSign', exponent `zExp',1071and significand formed by the concatenation of `zSig0' and `zSig1', and1072returns the proper quadruple-precision floating-point value corresponding1073to the abstract input. This routine is just like `roundAndPackFloat128'1074except that the input significand has fewer bits and does not have to be1075normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-1076point exponent.1077-------------------------------------------------------------------------------1078*/1079static float1281080normalizeRoundAndPackFloat128(1081flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )1082{1083int8 shiftCount;1084bits64 zSig2;10851086if ( zSig0 == 0 ) {1087zSig0 = zSig1;1088zSig1 = 0;1089zExp -= 64;1090}1091shiftCount = countLeadingZeros64( zSig0 ) - 15;1092if ( 0 <= shiftCount ) {1093zSig2 = 0;1094shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );1095}1096else {1097shift128ExtraRightJamming(1098zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );1099}1100zExp -= shiftCount;1101return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );11021103}11041105#endif11061107/*1108-------------------------------------------------------------------------------1109Returns the result of converting the 32-bit two's complement integer `a'1110to the single-precision floating-point format. The conversion is performed1111according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1112-------------------------------------------------------------------------------1113*/1114float32 int32_to_float32( int32 a )1115{1116flag zSign;11171118if ( a == 0 ) return 0;1119if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );1120zSign = ( a < 0 );1121return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );11221123}11241125#ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */1126float32 uint32_to_float32( uint32 a )1127{1128if ( a == 0 ) return 0;1129if ( a & (bits32) 0x80000000 )1130return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );1131return normalizeRoundAndPackFloat32( 0, 0x9C, a );1132}1133#endif113411351136/*1137-------------------------------------------------------------------------------1138Returns the result of converting the 32-bit two's complement integer `a'1139to the double-precision floating-point format. The conversion is performed1140according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1141-------------------------------------------------------------------------------1142*/1143float64 int32_to_float64( int32 a )1144{1145flag zSign;1146uint32 absA;1147int8 shiftCount;1148bits64 zSig;11491150if ( a == 0 ) return 0;1151zSign = ( a < 0 );1152absA = zSign ? - a : a;1153shiftCount = countLeadingZeros32( absA ) + 21;1154zSig = absA;1155return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );11561157}11581159#ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */1160float64 uint32_to_float64( uint32 a )1161{1162int8 shiftCount;1163bits64 zSig = a;11641165if ( a == 0 ) return 0;1166shiftCount = countLeadingZeros32( a ) + 21;1167return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );11681169}1170#endif11711172#ifdef FLOATX8011731174/*1175-------------------------------------------------------------------------------1176Returns the result of converting the 32-bit two's complement integer `a'1177to the extended double-precision floating-point format. The conversion1178is performed according to the IEC/IEEE Standard for Binary Floating-Point1179Arithmetic.1180-------------------------------------------------------------------------------1181*/1182floatx80 int32_to_floatx80( int32 a )1183{1184flag zSign;1185uint32 absA;1186int8 shiftCount;1187bits64 zSig;11881189if ( a == 0 ) return packFloatx80( 0, 0, 0 );1190zSign = ( a < 0 );1191absA = zSign ? - a : a;1192shiftCount = countLeadingZeros32( absA ) + 32;1193zSig = absA;1194return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );11951196}11971198floatx80 uint32_to_floatx80( uint32 a )1199{1200int8 shiftCount;1201bits64 zSig = a;12021203if ( a == 0 ) return packFloatx80( 0, 0, 0 );1204shiftCount = countLeadingZeros32( a ) + 32;1205return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );12061207}12081209#endif12101211#ifdef FLOAT12812121213/*1214-------------------------------------------------------------------------------1215Returns the result of converting the 32-bit two's complement integer `a' to1216the quadruple-precision floating-point format. The conversion is performed1217according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1218-------------------------------------------------------------------------------1219*/1220float128 int32_to_float128( int32 a )1221{1222flag zSign;1223uint32 absA;1224int8 shiftCount;1225bits64 zSig0;12261227if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );1228zSign = ( a < 0 );1229absA = zSign ? - a : a;1230shiftCount = countLeadingZeros32( absA ) + 17;1231zSig0 = absA;1232return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );12331234}12351236float128 uint32_to_float128( uint32 a )1237{1238int8 shiftCount;1239bits64 zSig0 = a;12401241if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );1242shiftCount = countLeadingZeros32( a ) + 17;1243return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );12441245}12461247#endif12481249#ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */1250/*1251-------------------------------------------------------------------------------1252Returns the result of converting the 64-bit two's complement integer `a'1253to the single-precision floating-point format. The conversion is performed1254according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1255-------------------------------------------------------------------------------1256*/1257float32 int64_to_float32( int64 a )1258{1259flag zSign;1260uint64 absA;1261int8 shiftCount;12621263if ( a == 0 ) return 0;1264zSign = ( a < 0 );1265absA = zSign ? - a : a;1266shiftCount = countLeadingZeros64( absA ) - 40;1267if ( 0 <= shiftCount ) {1268return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );1269}1270else {1271shiftCount += 7;1272if ( shiftCount < 0 ) {1273shift64RightJamming( absA, - shiftCount, &absA );1274}1275else {1276absA <<= shiftCount;1277}1278return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );1279}12801281}12821283/*1284-------------------------------------------------------------------------------1285Returns the result of converting the 64-bit two's complement integer `a'1286to the double-precision floating-point format. The conversion is performed1287according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1288-------------------------------------------------------------------------------1289*/1290float64 int64_to_float64( int64 a )1291{1292flag zSign;12931294if ( a == 0 ) return 0;1295if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {1296return packFloat64( 1, 0x43E, 0 );1297}1298zSign = ( a < 0 );1299return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );13001301}13021303#ifdef FLOATX8013041305/*1306-------------------------------------------------------------------------------1307Returns the result of converting the 64-bit two's complement integer `a'1308to the extended double-precision floating-point format. The conversion1309is performed according to the IEC/IEEE Standard for Binary Floating-Point1310Arithmetic.1311-------------------------------------------------------------------------------1312*/1313floatx80 int64_to_floatx80( int64 a )1314{1315flag zSign;1316uint64 absA;1317int8 shiftCount;13181319if ( a == 0 ) return packFloatx80( 0, 0, 0 );1320zSign = ( a < 0 );1321absA = zSign ? - a : a;1322shiftCount = countLeadingZeros64( absA );1323return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );13241325}13261327#endif13281329#endif /* !SOFTFLOAT_FOR_GCC */13301331#ifdef FLOAT12813321333/*1334-------------------------------------------------------------------------------1335Returns the result of converting the 64-bit two's complement integer `a' to1336the quadruple-precision floating-point format. The conversion is performed1337according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1338-------------------------------------------------------------------------------1339*/1340float128 int64_to_float128( int64 a )1341{1342flag zSign;1343uint64 absA;1344int8 shiftCount;1345int32 zExp;1346bits64 zSig0, zSig1;13471348if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );1349zSign = ( a < 0 );1350absA = zSign ? - a : a;1351shiftCount = countLeadingZeros64( absA ) + 49;1352zExp = 0x406E - shiftCount;1353if ( 64 <= shiftCount ) {1354zSig1 = 0;1355zSig0 = absA;1356shiftCount -= 64;1357}1358else {1359zSig1 = absA;1360zSig0 = 0;1361}1362shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );1363return packFloat128( zSign, zExp, zSig0, zSig1 );13641365}13661367#endif13681369#ifndef SOFTFLOAT_FOR_GCC /* Not needed */1370/*1371-------------------------------------------------------------------------------1372Returns the result of converting the single-precision floating-point value1373`a' to the 32-bit two's complement integer format. The conversion is1374performed according to the IEC/IEEE Standard for Binary Floating-Point1375Arithmetic---which means in particular that the conversion is rounded1376according to the current rounding mode. If `a' is a NaN, the largest1377positive integer is returned. Otherwise, if the conversion overflows, the1378largest integer with the same sign as `a' is returned.1379-------------------------------------------------------------------------------1380*/1381int32 float32_to_int32( float32 a )1382{1383flag aSign;1384int16 aExp, shiftCount;1385bits32 aSig;1386bits64 aSig64;13871388aSig = extractFloat32Frac( a );1389aExp = extractFloat32Exp( a );1390aSign = extractFloat32Sign( a );1391if ( ( aExp == 0xFF ) && aSig ) aSign = 0;1392if ( aExp ) aSig |= 0x00800000;1393shiftCount = 0xAF - aExp;1394aSig64 = aSig;1395aSig64 <<= 32;1396if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );1397return roundAndPackInt32( aSign, aSig64 );13981399}1400#endif /* !SOFTFLOAT_FOR_GCC */14011402/*1403-------------------------------------------------------------------------------1404Returns the result of converting the single-precision floating-point value1405`a' to the 32-bit two's complement integer format. The conversion is1406performed according to the IEC/IEEE Standard for Binary Floating-Point1407Arithmetic, except that the conversion is always rounded toward zero.1408If `a' is a NaN, the largest positive integer is returned. Otherwise, if1409the conversion overflows, the largest integer with the same sign as `a' is1410returned.1411-------------------------------------------------------------------------------1412*/1413int32 float32_to_int32_round_to_zero( float32 a )1414{1415flag aSign;1416int16 aExp, shiftCount;1417bits32 aSig;1418int32 z;14191420aSig = extractFloat32Frac( a );1421aExp = extractFloat32Exp( a );1422aSign = extractFloat32Sign( a );1423shiftCount = aExp - 0x9E;1424if ( 0 <= shiftCount ) {1425if ( a != 0xCF000000 ) {1426float_raise( float_flag_invalid );1427if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;1428}1429return (sbits32) 0x80000000;1430}1431else if ( aExp <= 0x7E ) {1432if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;1433return 0;1434}1435aSig = ( aSig | 0x00800000 )<<8;1436z = aSig>>( - shiftCount );1437if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {1438float_exception_flags |= float_flag_inexact;1439}1440if ( aSign ) z = - z;1441return z;14421443}14441445#ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */1446/*1447-------------------------------------------------------------------------------1448Returns the result of converting the single-precision floating-point value1449`a' to the 64-bit two's complement integer format. The conversion is1450performed according to the IEC/IEEE Standard for Binary Floating-Point1451Arithmetic---which means in particular that the conversion is rounded1452according to the current rounding mode. If `a' is a NaN, the largest1453positive integer is returned. Otherwise, if the conversion overflows, the1454largest integer with the same sign as `a' is returned.1455-------------------------------------------------------------------------------1456*/1457int64 float32_to_int64( float32 a )1458{1459flag aSign;1460int16 aExp, shiftCount;1461bits32 aSig;1462bits64 aSig64, aSigExtra;14631464aSig = extractFloat32Frac( a );1465aExp = extractFloat32Exp( a );1466aSign = extractFloat32Sign( a );1467shiftCount = 0xBE - aExp;1468if ( shiftCount < 0 ) {1469float_raise( float_flag_invalid );1470if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {1471return LIT64( 0x7FFFFFFFFFFFFFFF );1472}1473return (sbits64) LIT64( 0x8000000000000000 );1474}1475if ( aExp ) aSig |= 0x00800000;1476aSig64 = aSig;1477aSig64 <<= 40;1478shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );1479return roundAndPackInt64( aSign, aSig64, aSigExtra );14801481}14821483/*1484-------------------------------------------------------------------------------1485Returns the result of converting the single-precision floating-point value1486`a' to the 64-bit two's complement integer format. The conversion is1487performed according to the IEC/IEEE Standard for Binary Floating-Point1488Arithmetic, except that the conversion is always rounded toward zero. If1489`a' is a NaN, the largest positive integer is returned. Otherwise, if the1490conversion overflows, the largest integer with the same sign as `a' is1491returned.1492-------------------------------------------------------------------------------1493*/1494int64 float32_to_int64_round_to_zero( float32 a )1495{1496flag aSign;1497int16 aExp, shiftCount;1498bits32 aSig;1499bits64 aSig64;1500int64 z;15011502aSig = extractFloat32Frac( a );1503aExp = extractFloat32Exp( a );1504aSign = extractFloat32Sign( a );1505shiftCount = aExp - 0xBE;1506if ( 0 <= shiftCount ) {1507if ( a != 0xDF000000 ) {1508float_raise( float_flag_invalid );1509if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {1510return LIT64( 0x7FFFFFFFFFFFFFFF );1511}1512}1513return (sbits64) LIT64( 0x8000000000000000 );1514}1515else if ( aExp <= 0x7E ) {1516if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;1517return 0;1518}1519aSig64 = aSig | 0x00800000;1520aSig64 <<= 40;1521z = aSig64>>( - shiftCount );1522if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {1523float_exception_flags |= float_flag_inexact;1524}1525if ( aSign ) z = - z;1526return z;15271528}1529#endif /* !SOFTFLOAT_FOR_GCC */15301531/*1532-------------------------------------------------------------------------------1533Returns the result of converting the single-precision floating-point value1534`a' to the double-precision floating-point format. The conversion is1535performed according to the IEC/IEEE Standard for Binary Floating-Point1536Arithmetic.1537-------------------------------------------------------------------------------1538*/1539float64 float32_to_float64( float32 a )1540{1541flag aSign;1542int16 aExp;1543bits32 aSig;15441545aSig = extractFloat32Frac( a );1546aExp = extractFloat32Exp( a );1547aSign = extractFloat32Sign( a );1548if ( aExp == 0xFF ) {1549if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );1550return packFloat64( aSign, 0x7FF, 0 );1551}1552if ( aExp == 0 ) {1553if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );1554normalizeFloat32Subnormal( aSig, &aExp, &aSig );1555--aExp;1556}1557return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );15581559}15601561#ifdef FLOATX8015621563/*1564-------------------------------------------------------------------------------1565Returns the result of converting the single-precision floating-point value1566`a' to the extended double-precision floating-point format. The conversion1567is performed according to the IEC/IEEE Standard for Binary Floating-Point1568Arithmetic.1569-------------------------------------------------------------------------------1570*/1571floatx80 float32_to_floatx80( float32 a )1572{1573flag aSign;1574int16 aExp;1575bits32 aSig;15761577aSig = extractFloat32Frac( a );1578aExp = extractFloat32Exp( a );1579aSign = extractFloat32Sign( a );1580if ( aExp == 0xFF ) {1581if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );1582return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );1583}1584if ( aExp == 0 ) {1585if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );1586normalizeFloat32Subnormal( aSig, &aExp, &aSig );1587}1588aSig |= 0x00800000;1589return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );15901591}15921593#endif15941595#ifdef FLOAT12815961597/*1598-------------------------------------------------------------------------------1599Returns the result of converting the single-precision floating-point value1600`a' to the double-precision floating-point format. The conversion is1601performed according to the IEC/IEEE Standard for Binary Floating-Point1602Arithmetic.1603-------------------------------------------------------------------------------1604*/1605float128 float32_to_float128( float32 a )1606{1607flag aSign;1608int16 aExp;1609bits32 aSig;16101611aSig = extractFloat32Frac( a );1612aExp = extractFloat32Exp( a );1613aSign = extractFloat32Sign( a );1614if ( aExp == 0xFF ) {1615if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );1616return packFloat128( aSign, 0x7FFF, 0, 0 );1617}1618if ( aExp == 0 ) {1619if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );1620normalizeFloat32Subnormal( aSig, &aExp, &aSig );1621--aExp;1622}1623return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );16241625}16261627#endif16281629#ifndef SOFTFLOAT_FOR_GCC /* Not needed */1630/*1631-------------------------------------------------------------------------------1632Rounds the single-precision floating-point value `a' to an integer, and1633returns the result as a single-precision floating-point value. The1634operation is performed according to the IEC/IEEE Standard for Binary1635Floating-Point Arithmetic.1636-------------------------------------------------------------------------------1637*/1638float32 float32_round_to_int( float32 a )1639{1640flag aSign;1641int16 aExp;1642bits32 lastBitMask, roundBitsMask;1643int8 roundingMode;1644float32 z;16451646aExp = extractFloat32Exp( a );1647if ( 0x96 <= aExp ) {1648if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {1649return propagateFloat32NaN( a, a );1650}1651return a;1652}1653if ( aExp <= 0x7E ) {1654if ( (bits32) ( a<<1 ) == 0 ) return a;1655float_exception_flags |= float_flag_inexact;1656aSign = extractFloat32Sign( a );1657switch ( float_rounding_mode ) {1658case float_round_nearest_even:1659if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {1660return packFloat32( aSign, 0x7F, 0 );1661}1662break;1663case float_round_to_zero:1664break;1665case float_round_down:1666return aSign ? 0xBF800000 : 0;1667case float_round_up:1668return aSign ? 0x80000000 : 0x3F800000;1669}1670return packFloat32( aSign, 0, 0 );1671}1672lastBitMask = 1;1673lastBitMask <<= 0x96 - aExp;1674roundBitsMask = lastBitMask - 1;1675z = a;1676roundingMode = float_rounding_mode;1677if ( roundingMode == float_round_nearest_even ) {1678z += lastBitMask>>1;1679if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;1680}1681else if ( roundingMode != float_round_to_zero ) {1682if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {1683z += roundBitsMask;1684}1685}1686z &= ~ roundBitsMask;1687if ( z != a ) float_exception_flags |= float_flag_inexact;1688return z;16891690}1691#endif /* !SOFTFLOAT_FOR_GCC */16921693/*1694-------------------------------------------------------------------------------1695Returns the result of adding the absolute values of the single-precision1696floating-point values `a' and `b'. If `zSign' is 1, the sum is negated1697before being returned. `zSign' is ignored if the result is a NaN.1698The addition is performed according to the IEC/IEEE Standard for Binary1699Floating-Point Arithmetic.1700-------------------------------------------------------------------------------1701*/1702static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )1703{1704int16 aExp, bExp, zExp;1705bits32 aSig, bSig, zSig;1706int16 expDiff;17071708aSig = extractFloat32Frac( a );1709aExp = extractFloat32Exp( a );1710bSig = extractFloat32Frac( b );1711bExp = extractFloat32Exp( b );1712expDiff = aExp - bExp;1713aSig <<= 6;1714bSig <<= 6;1715if ( 0 < expDiff ) {1716if ( aExp == 0xFF ) {1717if ( aSig ) return propagateFloat32NaN( a, b );1718return a;1719}1720if ( bExp == 0 ) {1721--expDiff;1722}1723else {1724bSig |= 0x20000000;1725}1726shift32RightJamming( bSig, expDiff, &bSig );1727zExp = aExp;1728}1729else if ( expDiff < 0 ) {1730if ( bExp == 0xFF ) {1731if ( bSig ) return propagateFloat32NaN( a, b );1732return packFloat32( zSign, 0xFF, 0 );1733}1734if ( aExp == 0 ) {1735++expDiff;1736}1737else {1738aSig |= 0x20000000;1739}1740shift32RightJamming( aSig, - expDiff, &aSig );1741zExp = bExp;1742}1743else {1744if ( aExp == 0xFF ) {1745if ( aSig | bSig ) return propagateFloat32NaN( a, b );1746return a;1747}1748if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );1749zSig = 0x40000000 + aSig + bSig;1750zExp = aExp;1751goto roundAndPack;1752}1753aSig |= 0x20000000;1754zSig = ( aSig + bSig )<<1;1755--zExp;1756if ( (sbits32) zSig < 0 ) {1757zSig = aSig + bSig;1758++zExp;1759}1760roundAndPack:1761return roundAndPackFloat32( zSign, zExp, zSig );17621763}17641765/*1766-------------------------------------------------------------------------------1767Returns the result of subtracting the absolute values of the single-1768precision floating-point values `a' and `b'. If `zSign' is 1, the1769difference is negated before being returned. `zSign' is ignored if the1770result is a NaN. The subtraction is performed according to the IEC/IEEE1771Standard for Binary Floating-Point Arithmetic.1772-------------------------------------------------------------------------------1773*/1774static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )1775{1776int16 aExp, bExp, zExp;1777bits32 aSig, bSig, zSig;1778int16 expDiff;17791780aSig = extractFloat32Frac( a );1781aExp = extractFloat32Exp( a );1782bSig = extractFloat32Frac( b );1783bExp = extractFloat32Exp( b );1784expDiff = aExp - bExp;1785aSig <<= 7;1786bSig <<= 7;1787if ( 0 < expDiff ) goto aExpBigger;1788if ( expDiff < 0 ) goto bExpBigger;1789if ( aExp == 0xFF ) {1790if ( aSig | bSig ) return propagateFloat32NaN( a, b );1791float_raise( float_flag_invalid );1792return float32_default_nan;1793}1794if ( aExp == 0 ) {1795aExp = 1;1796bExp = 1;1797}1798if ( bSig < aSig ) goto aBigger;1799if ( aSig < bSig ) goto bBigger;1800return packFloat32( float_rounding_mode == float_round_down, 0, 0 );1801bExpBigger:1802if ( bExp == 0xFF ) {1803if ( bSig ) return propagateFloat32NaN( a, b );1804return packFloat32( zSign ^ 1, 0xFF, 0 );1805}1806if ( aExp == 0 ) {1807++expDiff;1808}1809else {1810aSig |= 0x40000000;1811}1812shift32RightJamming( aSig, - expDiff, &aSig );1813bSig |= 0x40000000;1814bBigger:1815zSig = bSig - aSig;1816zExp = bExp;1817zSign ^= 1;1818goto normalizeRoundAndPack;1819aExpBigger:1820if ( aExp == 0xFF ) {1821if ( aSig ) return propagateFloat32NaN( a, b );1822return a;1823}1824if ( bExp == 0 ) {1825--expDiff;1826}1827else {1828bSig |= 0x40000000;1829}1830shift32RightJamming( bSig, expDiff, &bSig );1831aSig |= 0x40000000;1832aBigger:1833zSig = aSig - bSig;1834zExp = aExp;1835normalizeRoundAndPack:1836--zExp;1837return normalizeRoundAndPackFloat32( zSign, zExp, zSig );18381839}18401841/*1842-------------------------------------------------------------------------------1843Returns the result of adding the single-precision floating-point values `a'1844and `b'. The operation is performed according to the IEC/IEEE Standard for1845Binary Floating-Point Arithmetic.1846-------------------------------------------------------------------------------1847*/1848float32 float32_add( float32 a, float32 b )1849{1850flag aSign, bSign;18511852aSign = extractFloat32Sign( a );1853bSign = extractFloat32Sign( b );1854if ( aSign == bSign ) {1855return addFloat32Sigs( a, b, aSign );1856}1857else {1858return subFloat32Sigs( a, b, aSign );1859}18601861}18621863/*1864-------------------------------------------------------------------------------1865Returns the result of subtracting the single-precision floating-point values1866`a' and `b'. The operation is performed according to the IEC/IEEE Standard1867for Binary Floating-Point Arithmetic.1868-------------------------------------------------------------------------------1869*/1870float32 float32_sub( float32 a, float32 b )1871{1872flag aSign, bSign;18731874aSign = extractFloat32Sign( a );1875bSign = extractFloat32Sign( b );1876if ( aSign == bSign ) {1877return subFloat32Sigs( a, b, aSign );1878}1879else {1880return addFloat32Sigs( a, b, aSign );1881}18821883}18841885/*1886-------------------------------------------------------------------------------1887Returns the result of multiplying the single-precision floating-point values1888`a' and `b'. The operation is performed according to the IEC/IEEE Standard1889for Binary Floating-Point Arithmetic.1890-------------------------------------------------------------------------------1891*/1892float32 float32_mul( float32 a, float32 b )1893{1894flag aSign, bSign, zSign;1895int16 aExp, bExp, zExp;1896bits32 aSig, bSig;1897bits64 zSig64;1898bits32 zSig;18991900aSig = extractFloat32Frac( a );1901aExp = extractFloat32Exp( a );1902aSign = extractFloat32Sign( a );1903bSig = extractFloat32Frac( b );1904bExp = extractFloat32Exp( b );1905bSign = extractFloat32Sign( b );1906zSign = aSign ^ bSign;1907if ( aExp == 0xFF ) {1908if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {1909return propagateFloat32NaN( a, b );1910}1911if ( ( bExp | bSig ) == 0 ) {1912float_raise( float_flag_invalid );1913return float32_default_nan;1914}1915return packFloat32( zSign, 0xFF, 0 );1916}1917if ( bExp == 0xFF ) {1918if ( bSig ) return propagateFloat32NaN( a, b );1919if ( ( aExp | aSig ) == 0 ) {1920float_raise( float_flag_invalid );1921return float32_default_nan;1922}1923return packFloat32( zSign, 0xFF, 0 );1924}1925if ( aExp == 0 ) {1926if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );1927normalizeFloat32Subnormal( aSig, &aExp, &aSig );1928}1929if ( bExp == 0 ) {1930if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );1931normalizeFloat32Subnormal( bSig, &bExp, &bSig );1932}1933zExp = aExp + bExp - 0x7F;1934aSig = ( aSig | 0x00800000 )<<7;1935bSig = ( bSig | 0x00800000 )<<8;1936shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );1937zSig = zSig64;1938if ( 0 <= (sbits32) ( zSig<<1 ) ) {1939zSig <<= 1;1940--zExp;1941}1942return roundAndPackFloat32( zSign, zExp, zSig );19431944}19451946/*1947-------------------------------------------------------------------------------1948Returns the result of dividing the single-precision floating-point value `a'1949by the corresponding value `b'. The operation is performed according to the1950IEC/IEEE Standard for Binary Floating-Point Arithmetic.1951-------------------------------------------------------------------------------1952*/1953float32 float32_div( float32 a, float32 b )1954{1955flag aSign, bSign, zSign;1956int16 aExp, bExp, zExp;1957bits32 aSig, bSig, zSig;19581959aSig = extractFloat32Frac( a );1960aExp = extractFloat32Exp( a );1961aSign = extractFloat32Sign( a );1962bSig = extractFloat32Frac( b );1963bExp = extractFloat32Exp( b );1964bSign = extractFloat32Sign( b );1965zSign = aSign ^ bSign;1966if ( aExp == 0xFF ) {1967if ( aSig ) return propagateFloat32NaN( a, b );1968if ( bExp == 0xFF ) {1969if ( bSig ) return propagateFloat32NaN( a, b );1970float_raise( float_flag_invalid );1971return float32_default_nan;1972}1973return packFloat32( zSign, 0xFF, 0 );1974}1975if ( bExp == 0xFF ) {1976if ( bSig ) return propagateFloat32NaN( a, b );1977return packFloat32( zSign, 0, 0 );1978}1979if ( bExp == 0 ) {1980if ( bSig == 0 ) {1981if ( ( aExp | aSig ) == 0 ) {1982float_raise( float_flag_invalid );1983return float32_default_nan;1984}1985float_raise( float_flag_divbyzero );1986return packFloat32( zSign, 0xFF, 0 );1987}1988normalizeFloat32Subnormal( bSig, &bExp, &bSig );1989}1990if ( aExp == 0 ) {1991if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );1992normalizeFloat32Subnormal( aSig, &aExp, &aSig );1993}1994zExp = aExp - bExp + 0x7D;1995aSig = ( aSig | 0x00800000 )<<7;1996bSig = ( bSig | 0x00800000 )<<8;1997if ( bSig <= ( aSig + aSig ) ) {1998aSig >>= 1;1999++zExp;2000}2001zSig = ( ( (bits64) aSig )<<32 ) / bSig;2002if ( ( zSig & 0x3F ) == 0 ) {2003zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );2004}2005return roundAndPackFloat32( zSign, zExp, zSig );20062007}20082009#ifndef SOFTFLOAT_FOR_GCC /* Not needed */2010/*2011-------------------------------------------------------------------------------2012Returns the remainder of the single-precision floating-point value `a'2013with respect to the corresponding value `b'. The operation is performed2014according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2015-------------------------------------------------------------------------------2016*/2017float32 float32_rem( float32 a, float32 b )2018{2019flag aSign, bSign, zSign;2020int16 aExp, bExp, expDiff;2021bits32 aSig, bSig;2022bits32 q;2023bits64 aSig64, bSig64, q64;2024bits32 alternateASig;2025sbits32 sigMean;20262027aSig = extractFloat32Frac( a );2028aExp = extractFloat32Exp( a );2029aSign = extractFloat32Sign( a );2030bSig = extractFloat32Frac( b );2031bExp = extractFloat32Exp( b );2032bSign = extractFloat32Sign( b );2033if ( aExp == 0xFF ) {2034if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {2035return propagateFloat32NaN( a, b );2036}2037float_raise( float_flag_invalid );2038return float32_default_nan;2039}2040if ( bExp == 0xFF ) {2041if ( bSig ) return propagateFloat32NaN( a, b );2042return a;2043}2044if ( bExp == 0 ) {2045if ( bSig == 0 ) {2046float_raise( float_flag_invalid );2047return float32_default_nan;2048}2049normalizeFloat32Subnormal( bSig, &bExp, &bSig );2050}2051if ( aExp == 0 ) {2052if ( aSig == 0 ) return a;2053normalizeFloat32Subnormal( aSig, &aExp, &aSig );2054}2055expDiff = aExp - bExp;2056aSig |= 0x00800000;2057bSig |= 0x00800000;2058if ( expDiff < 32 ) {2059aSig <<= 8;2060bSig <<= 8;2061if ( expDiff < 0 ) {2062if ( expDiff < -1 ) return a;2063aSig >>= 1;2064}2065q = ( bSig <= aSig );2066if ( q ) aSig -= bSig;2067if ( 0 < expDiff ) {2068q = ( ( (bits64) aSig )<<32 ) / bSig;2069q >>= 32 - expDiff;2070bSig >>= 2;2071aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;2072}2073else {2074aSig >>= 2;2075bSig >>= 2;2076}2077}2078else {2079if ( bSig <= aSig ) aSig -= bSig;2080aSig64 = ( (bits64) aSig )<<40;2081bSig64 = ( (bits64) bSig )<<40;2082expDiff -= 64;2083while ( 0 < expDiff ) {2084q64 = estimateDiv128To64( aSig64, 0, bSig64 );2085q64 = ( 2 < q64 ) ? q64 - 2 : 0;2086aSig64 = - ( ( bSig * q64 )<<38 );2087expDiff -= 62;2088}2089expDiff += 64;2090q64 = estimateDiv128To64( aSig64, 0, bSig64 );2091q64 = ( 2 < q64 ) ? q64 - 2 : 0;2092q = q64>>( 64 - expDiff );2093bSig <<= 6;2094aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;2095}2096do {2097alternateASig = aSig;2098++q;2099aSig -= bSig;2100} while ( 0 <= (sbits32) aSig );2101sigMean = aSig + alternateASig;2102if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {2103aSig = alternateASig;2104}2105zSign = ( (sbits32) aSig < 0 );2106if ( zSign ) aSig = - aSig;2107return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );21082109}2110#endif /* !SOFTFLOAT_FOR_GCC */21112112#ifndef SOFTFLOAT_FOR_GCC /* Not needed */2113/*2114-------------------------------------------------------------------------------2115Returns the square root of the single-precision floating-point value `a'.2116The operation is performed according to the IEC/IEEE Standard for Binary2117Floating-Point Arithmetic.2118-------------------------------------------------------------------------------2119*/2120float32 float32_sqrt( float32 a )2121{2122flag aSign;2123int16 aExp, zExp;2124bits32 aSig, zSig;2125bits64 rem, term;21262127aSig = extractFloat32Frac( a );2128aExp = extractFloat32Exp( a );2129aSign = extractFloat32Sign( a );2130if ( aExp == 0xFF ) {2131if ( aSig ) return propagateFloat32NaN( a, 0 );2132if ( ! aSign ) return a;2133float_raise( float_flag_invalid );2134return float32_default_nan;2135}2136if ( aSign ) {2137if ( ( aExp | aSig ) == 0 ) return a;2138float_raise( float_flag_invalid );2139return float32_default_nan;2140}2141if ( aExp == 0 ) {2142if ( aSig == 0 ) return 0;2143normalizeFloat32Subnormal( aSig, &aExp, &aSig );2144}2145zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;2146aSig = ( aSig | 0x00800000 )<<8;2147zSig = estimateSqrt32( aExp, aSig ) + 2;2148if ( ( zSig & 0x7F ) <= 5 ) {2149if ( zSig < 2 ) {2150zSig = 0x7FFFFFFF;2151goto roundAndPack;2152}2153aSig >>= aExp & 1;2154term = ( (bits64) zSig ) * zSig;2155rem = ( ( (bits64) aSig )<<32 ) - term;2156while ( (sbits64) rem < 0 ) {2157--zSig;2158rem += ( ( (bits64) zSig )<<1 ) | 1;2159}2160zSig |= ( rem != 0 );2161}2162shift32RightJamming( zSig, 1, &zSig );2163roundAndPack:2164return roundAndPackFloat32( 0, zExp, zSig );21652166}2167#endif /* !SOFTFLOAT_FOR_GCC */21682169/*2170-------------------------------------------------------------------------------2171Returns 1 if the single-precision floating-point value `a' is equal to2172the corresponding value `b', and 0 otherwise. The comparison is performed2173according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2174-------------------------------------------------------------------------------2175*/2176flag float32_eq( float32 a, float32 b )2177{21782179if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2180|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2181) {2182if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {2183float_raise( float_flag_invalid );2184}2185return 0;2186}2187return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );21882189}21902191/*2192-------------------------------------------------------------------------------2193Returns 1 if the single-precision floating-point value `a' is less than2194or equal to the corresponding value `b', and 0 otherwise. The comparison2195is performed according to the IEC/IEEE Standard for Binary Floating-Point2196Arithmetic.2197-------------------------------------------------------------------------------2198*/2199flag float32_le( float32 a, float32 b )2200{2201flag aSign, bSign;22022203if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2204|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2205) {2206float_raise( float_flag_invalid );2207return 0;2208}2209aSign = extractFloat32Sign( a );2210bSign = extractFloat32Sign( b );2211if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );2212return ( a == b ) || ( aSign ^ ( a < b ) );22132214}22152216/*2217-------------------------------------------------------------------------------2218Returns 1 if the single-precision floating-point value `a' is less than2219the corresponding value `b', and 0 otherwise. The comparison is performed2220according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2221-------------------------------------------------------------------------------2222*/2223flag float32_lt( float32 a, float32 b )2224{2225flag aSign, bSign;22262227if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2228|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2229) {2230float_raise( float_flag_invalid );2231return 0;2232}2233aSign = extractFloat32Sign( a );2234bSign = extractFloat32Sign( b );2235if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );2236return ( a != b ) && ( aSign ^ ( a < b ) );22372238}22392240#ifndef SOFTFLOAT_FOR_GCC /* Not needed */2241/*2242-------------------------------------------------------------------------------2243Returns 1 if the single-precision floating-point value `a' is equal to2244the corresponding value `b', and 0 otherwise. The invalid exception is2245raised if either operand is a NaN. Otherwise, the comparison is performed2246according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2247-------------------------------------------------------------------------------2248*/2249flag float32_eq_signaling( float32 a, float32 b )2250{22512252if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2253|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2254) {2255float_raise( float_flag_invalid );2256return 0;2257}2258return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );22592260}22612262/*2263-------------------------------------------------------------------------------2264Returns 1 if the single-precision floating-point value `a' is less than or2265equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not2266cause an exception. Otherwise, the comparison is performed according to the2267IEC/IEEE Standard for Binary Floating-Point Arithmetic.2268-------------------------------------------------------------------------------2269*/2270flag float32_le_quiet( float32 a, float32 b )2271{2272flag aSign, bSign;22732274if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2275|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2276) {2277if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {2278float_raise( float_flag_invalid );2279}2280return 0;2281}2282aSign = extractFloat32Sign( a );2283bSign = extractFloat32Sign( b );2284if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );2285return ( a == b ) || ( aSign ^ ( a < b ) );22862287}22882289/*2290-------------------------------------------------------------------------------2291Returns 1 if the single-precision floating-point value `a' is less than2292the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an2293exception. Otherwise, the comparison is performed according to the IEC/IEEE2294Standard for Binary Floating-Point Arithmetic.2295-------------------------------------------------------------------------------2296*/2297flag float32_lt_quiet( float32 a, float32 b )2298{2299flag aSign, bSign;23002301if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )2302|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )2303) {2304if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {2305float_raise( float_flag_invalid );2306}2307return 0;2308}2309aSign = extractFloat32Sign( a );2310bSign = extractFloat32Sign( b );2311if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );2312return ( a != b ) && ( aSign ^ ( a < b ) );23132314}2315#endif /* !SOFTFLOAT_FOR_GCC */23162317#ifndef SOFTFLOAT_FOR_GCC /* Not needed */2318/*2319-------------------------------------------------------------------------------2320Returns the result of converting the double-precision floating-point value2321`a' to the 32-bit two's complement integer format. The conversion is2322performed according to the IEC/IEEE Standard for Binary Floating-Point2323Arithmetic---which means in particular that the conversion is rounded2324according to the current rounding mode. If `a' is a NaN, the largest2325positive integer is returned. Otherwise, if the conversion overflows, the2326largest integer with the same sign as `a' is returned.2327-------------------------------------------------------------------------------2328*/2329int32 float64_to_int32( float64 a )2330{2331flag aSign;2332int16 aExp, shiftCount;2333bits64 aSig;23342335aSig = extractFloat64Frac( a );2336aExp = extractFloat64Exp( a );2337aSign = extractFloat64Sign( a );2338if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;2339if ( aExp ) aSig |= LIT64( 0x0010000000000000 );2340shiftCount = 0x42C - aExp;2341if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );2342return roundAndPackInt32( aSign, aSig );23432344}2345#endif /* !SOFTFLOAT_FOR_GCC */23462347/*2348-------------------------------------------------------------------------------2349Returns the result of converting the double-precision floating-point value2350`a' to the 32-bit two's complement integer format. The conversion is2351performed according to the IEC/IEEE Standard for Binary Floating-Point2352Arithmetic, except that the conversion is always rounded toward zero.2353If `a' is a NaN, the largest positive integer is returned. Otherwise, if2354the conversion overflows, the largest integer with the same sign as `a' is2355returned.2356-------------------------------------------------------------------------------2357*/2358int32 float64_to_int32_round_to_zero( float64 a )2359{2360flag aSign;2361int16 aExp, shiftCount;2362bits64 aSig, savedASig;2363int32 z;23642365aSig = extractFloat64Frac( a );2366aExp = extractFloat64Exp( a );2367aSign = extractFloat64Sign( a );2368if ( 0x41E < aExp ) {2369if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;2370goto invalid;2371}2372else if ( aExp < 0x3FF ) {2373if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;2374return 0;2375}2376aSig |= LIT64( 0x0010000000000000 );2377shiftCount = 0x433 - aExp;2378savedASig = aSig;2379aSig >>= shiftCount;2380z = aSig;2381if ( aSign ) z = - z;2382if ( ( z < 0 ) ^ aSign ) {2383invalid:2384float_raise( float_flag_invalid );2385return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;2386}2387if ( ( aSig<<shiftCount ) != savedASig ) {2388float_exception_flags |= float_flag_inexact;2389}2390return z;23912392}23932394#ifndef SOFTFLOAT_FOR_GCC /* Not needed */2395/*2396-------------------------------------------------------------------------------2397Returns the result of converting the double-precision floating-point value2398`a' to the 64-bit two's complement integer format. The conversion is2399performed according to the IEC/IEEE Standard for Binary Floating-Point2400Arithmetic---which means in particular that the conversion is rounded2401according to the current rounding mode. If `a' is a NaN, the largest2402positive integer is returned. Otherwise, if the conversion overflows, the2403largest integer with the same sign as `a' is returned.2404-------------------------------------------------------------------------------2405*/2406int64 float64_to_int64( float64 a )2407{2408flag aSign;2409int16 aExp, shiftCount;2410bits64 aSig, aSigExtra;24112412aSig = extractFloat64Frac( a );2413aExp = extractFloat64Exp( a );2414aSign = extractFloat64Sign( a );2415if ( aExp ) aSig |= LIT64( 0x0010000000000000 );2416shiftCount = 0x433 - aExp;2417if ( shiftCount <= 0 ) {2418if ( 0x43E < aExp ) {2419float_raise( float_flag_invalid );2420if ( ! aSign2421|| ( ( aExp == 0x7FF )2422&& ( aSig != LIT64( 0x0010000000000000 ) ) )2423) {2424return LIT64( 0x7FFFFFFFFFFFFFFF );2425}2426return (sbits64) LIT64( 0x8000000000000000 );2427}2428aSigExtra = 0;2429aSig <<= - shiftCount;2430}2431else {2432shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );2433}2434return roundAndPackInt64( aSign, aSig, aSigExtra );24352436}24372438/*2439-------------------------------------------------------------------------------2440Returns the result of converting the double-precision floating-point value2441`a' to the 64-bit two's complement integer format. The conversion is2442performed according to the IEC/IEEE Standard for Binary Floating-Point2443Arithmetic, except that the conversion is always rounded toward zero.2444If `a' is a NaN, the largest positive integer is returned. Otherwise, if2445the conversion overflows, the largest integer with the same sign as `a' is2446returned.2447-------------------------------------------------------------------------------2448*/2449int64 float64_to_int64_round_to_zero( float64 a )2450{2451flag aSign;2452int16 aExp, shiftCount;2453bits64 aSig;2454int64 z;24552456aSig = extractFloat64Frac( a );2457aExp = extractFloat64Exp( a );2458aSign = extractFloat64Sign( a );2459if ( aExp ) aSig |= LIT64( 0x0010000000000000 );2460shiftCount = aExp - 0x433;2461if ( 0 <= shiftCount ) {2462if ( 0x43E <= aExp ) {2463if ( a != LIT64( 0xC3E0000000000000 ) ) {2464float_raise( float_flag_invalid );2465if ( ! aSign2466|| ( ( aExp == 0x7FF )2467&& ( aSig != LIT64( 0x0010000000000000 ) ) )2468) {2469return LIT64( 0x7FFFFFFFFFFFFFFF );2470}2471}2472return (sbits64) LIT64( 0x8000000000000000 );2473}2474z = aSig<<shiftCount;2475}2476else {2477if ( aExp < 0x3FE ) {2478if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;2479return 0;2480}2481z = aSig>>( - shiftCount );2482if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {2483float_exception_flags |= float_flag_inexact;2484}2485}2486if ( aSign ) z = - z;2487return z;24882489}2490#endif /* !SOFTFLOAT_FOR_GCC */24912492/*2493-------------------------------------------------------------------------------2494Returns the result of converting the double-precision floating-point value2495`a' to the single-precision floating-point format. The conversion is2496performed according to the IEC/IEEE Standard for Binary Floating-Point2497Arithmetic.2498-------------------------------------------------------------------------------2499*/2500float32 float64_to_float32( float64 a )2501{2502flag aSign;2503int16 aExp;2504bits64 aSig;2505bits32 zSig;25062507aSig = extractFloat64Frac( a );2508aExp = extractFloat64Exp( a );2509aSign = extractFloat64Sign( a );2510if ( aExp == 0x7FF ) {2511if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );2512return packFloat32( aSign, 0xFF, 0 );2513}2514shift64RightJamming( aSig, 22, &aSig );2515zSig = aSig;2516if ( aExp || zSig ) {2517zSig |= 0x40000000;2518aExp -= 0x381;2519}2520return roundAndPackFloat32( aSign, aExp, zSig );25212522}25232524#ifdef FLOATX8025252526/*2527-------------------------------------------------------------------------------2528Returns the result of converting the double-precision floating-point value2529`a' to the extended double-precision floating-point format. The conversion2530is performed according to the IEC/IEEE Standard for Binary Floating-Point2531Arithmetic.2532-------------------------------------------------------------------------------2533*/2534floatx80 float64_to_floatx80( float64 a )2535{2536flag aSign;2537int16 aExp;2538bits64 aSig;25392540aSig = extractFloat64Frac( a );2541aExp = extractFloat64Exp( a );2542aSign = extractFloat64Sign( a );2543if ( aExp == 0x7FF ) {2544if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );2545return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );2546}2547if ( aExp == 0 ) {2548if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );2549normalizeFloat64Subnormal( aSig, &aExp, &aSig );2550}2551return2552packFloatx80(2553aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );25542555}25562557#endif25582559#ifdef FLOAT12825602561/*2562-------------------------------------------------------------------------------2563Returns the result of converting the double-precision floating-point value2564`a' to the quadruple-precision floating-point format. The conversion is2565performed according to the IEC/IEEE Standard for Binary Floating-Point2566Arithmetic.2567-------------------------------------------------------------------------------2568*/2569float128 float64_to_float128( float64 a )2570{2571flag aSign;2572int16 aExp;2573bits64 aSig, zSig0, zSig1;25742575aSig = extractFloat64Frac( a );2576aExp = extractFloat64Exp( a );2577aSign = extractFloat64Sign( a );2578if ( aExp == 0x7FF ) {2579if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );2580return packFloat128( aSign, 0x7FFF, 0, 0 );2581}2582if ( aExp == 0 ) {2583if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );2584normalizeFloat64Subnormal( aSig, &aExp, &aSig );2585--aExp;2586}2587shift128Right( aSig, 0, 4, &zSig0, &zSig1 );2588return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );25892590}25912592#endif25932594#ifndef SOFTFLOAT_FOR_GCC2595/*2596-------------------------------------------------------------------------------2597Rounds the double-precision floating-point value `a' to an integer, and2598returns the result as a double-precision floating-point value. The2599operation is performed according to the IEC/IEEE Standard for Binary2600Floating-Point Arithmetic.2601-------------------------------------------------------------------------------2602*/2603float64 float64_round_to_int( float64 a )2604{2605flag aSign;2606int16 aExp;2607bits64 lastBitMask, roundBitsMask;2608int8 roundingMode;2609float64 z;26102611aExp = extractFloat64Exp( a );2612if ( 0x433 <= aExp ) {2613if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {2614return propagateFloat64NaN( a, a );2615}2616return a;2617}2618if ( aExp < 0x3FF ) {2619if ( (bits64) ( a<<1 ) == 0 ) return a;2620float_exception_flags |= float_flag_inexact;2621aSign = extractFloat64Sign( a );2622switch ( float_rounding_mode ) {2623case float_round_nearest_even:2624if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {2625return packFloat64( aSign, 0x3FF, 0 );2626}2627break;2628case float_round_to_zero:2629break;2630case float_round_down:2631return aSign ? LIT64( 0xBFF0000000000000 ) : 0;2632case float_round_up:2633return2634aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );2635}2636return packFloat64( aSign, 0, 0 );2637}2638lastBitMask = 1;2639lastBitMask <<= 0x433 - aExp;2640roundBitsMask = lastBitMask - 1;2641z = a;2642roundingMode = float_rounding_mode;2643if ( roundingMode == float_round_nearest_even ) {2644z += lastBitMask>>1;2645if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;2646}2647else if ( roundingMode != float_round_to_zero ) {2648if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {2649z += roundBitsMask;2650}2651}2652z &= ~ roundBitsMask;2653if ( z != a ) float_exception_flags |= float_flag_inexact;2654return z;26552656}2657#endif26582659/*2660-------------------------------------------------------------------------------2661Returns the result of adding the absolute values of the double-precision2662floating-point values `a' and `b'. If `zSign' is 1, the sum is negated2663before being returned. `zSign' is ignored if the result is a NaN.2664The addition is performed according to the IEC/IEEE Standard for Binary2665Floating-Point Arithmetic.2666-------------------------------------------------------------------------------2667*/2668static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )2669{2670int16 aExp, bExp, zExp;2671bits64 aSig, bSig, zSig;2672int16 expDiff;26732674aSig = extractFloat64Frac( a );2675aExp = extractFloat64Exp( a );2676bSig = extractFloat64Frac( b );2677bExp = extractFloat64Exp( b );2678expDiff = aExp - bExp;2679aSig <<= 9;2680bSig <<= 9;2681if ( 0 < expDiff ) {2682if ( aExp == 0x7FF ) {2683if ( aSig ) return propagateFloat64NaN( a, b );2684return a;2685}2686if ( bExp == 0 ) {2687--expDiff;2688}2689else {2690bSig |= LIT64( 0x2000000000000000 );2691}2692shift64RightJamming( bSig, expDiff, &bSig );2693zExp = aExp;2694}2695else if ( expDiff < 0 ) {2696if ( bExp == 0x7FF ) {2697if ( bSig ) return propagateFloat64NaN( a, b );2698return packFloat64( zSign, 0x7FF, 0 );2699}2700if ( aExp == 0 ) {2701++expDiff;2702}2703else {2704aSig |= LIT64( 0x2000000000000000 );2705}2706shift64RightJamming( aSig, - expDiff, &aSig );2707zExp = bExp;2708}2709else {2710if ( aExp == 0x7FF ) {2711if ( aSig | bSig ) return propagateFloat64NaN( a, b );2712return a;2713}2714if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );2715zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;2716zExp = aExp;2717goto roundAndPack;2718}2719aSig |= LIT64( 0x2000000000000000 );2720zSig = ( aSig + bSig )<<1;2721--zExp;2722if ( (sbits64) zSig < 0 ) {2723zSig = aSig + bSig;2724++zExp;2725}2726roundAndPack:2727return roundAndPackFloat64( zSign, zExp, zSig );27282729}27302731/*2732-------------------------------------------------------------------------------2733Returns the result of subtracting the absolute values of the double-2734precision floating-point values `a' and `b'. If `zSign' is 1, the2735difference is negated before being returned. `zSign' is ignored if the2736result is a NaN. The subtraction is performed according to the IEC/IEEE2737Standard for Binary Floating-Point Arithmetic.2738-------------------------------------------------------------------------------2739*/2740static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )2741{2742int16 aExp, bExp, zExp;2743bits64 aSig, bSig, zSig;2744int16 expDiff;27452746aSig = extractFloat64Frac( a );2747aExp = extractFloat64Exp( a );2748bSig = extractFloat64Frac( b );2749bExp = extractFloat64Exp( b );2750expDiff = aExp - bExp;2751aSig <<= 10;2752bSig <<= 10;2753if ( 0 < expDiff ) goto aExpBigger;2754if ( expDiff < 0 ) goto bExpBigger;2755if ( aExp == 0x7FF ) {2756if ( aSig | bSig ) return propagateFloat64NaN( a, b );2757float_raise( float_flag_invalid );2758return float64_default_nan;2759}2760if ( aExp == 0 ) {2761aExp = 1;2762bExp = 1;2763}2764if ( bSig < aSig ) goto aBigger;2765if ( aSig < bSig ) goto bBigger;2766return packFloat64( float_rounding_mode == float_round_down, 0, 0 );2767bExpBigger:2768if ( bExp == 0x7FF ) {2769if ( bSig ) return propagateFloat64NaN( a, b );2770return packFloat64( zSign ^ 1, 0x7FF, 0 );2771}2772if ( aExp == 0 ) {2773++expDiff;2774}2775else {2776aSig |= LIT64( 0x4000000000000000 );2777}2778shift64RightJamming( aSig, - expDiff, &aSig );2779bSig |= LIT64( 0x4000000000000000 );2780bBigger:2781zSig = bSig - aSig;2782zExp = bExp;2783zSign ^= 1;2784goto normalizeRoundAndPack;2785aExpBigger:2786if ( aExp == 0x7FF ) {2787if ( aSig ) return propagateFloat64NaN( a, b );2788return a;2789}2790if ( bExp == 0 ) {2791--expDiff;2792}2793else {2794bSig |= LIT64( 0x4000000000000000 );2795}2796shift64RightJamming( bSig, expDiff, &bSig );2797aSig |= LIT64( 0x4000000000000000 );2798aBigger:2799zSig = aSig - bSig;2800zExp = aExp;2801normalizeRoundAndPack:2802--zExp;2803return normalizeRoundAndPackFloat64( zSign, zExp, zSig );28042805}28062807/*2808-------------------------------------------------------------------------------2809Returns the result of adding the double-precision floating-point values `a'2810and `b'. The operation is performed according to the IEC/IEEE Standard for2811Binary Floating-Point Arithmetic.2812-------------------------------------------------------------------------------2813*/2814float64 float64_add( float64 a, float64 b )2815{2816flag aSign, bSign;28172818aSign = extractFloat64Sign( a );2819bSign = extractFloat64Sign( b );2820if ( aSign == bSign ) {2821return addFloat64Sigs( a, b, aSign );2822}2823else {2824return subFloat64Sigs( a, b, aSign );2825}28262827}28282829/*2830-------------------------------------------------------------------------------2831Returns the result of subtracting the double-precision floating-point values2832`a' and `b'. The operation is performed according to the IEC/IEEE Standard2833for Binary Floating-Point Arithmetic.2834-------------------------------------------------------------------------------2835*/2836float64 float64_sub( float64 a, float64 b )2837{2838flag aSign, bSign;28392840aSign = extractFloat64Sign( a );2841bSign = extractFloat64Sign( b );2842if ( aSign == bSign ) {2843return subFloat64Sigs( a, b, aSign );2844}2845else {2846return addFloat64Sigs( a, b, aSign );2847}28482849}28502851/*2852-------------------------------------------------------------------------------2853Returns the result of multiplying the double-precision floating-point values2854`a' and `b'. The operation is performed according to the IEC/IEEE Standard2855for Binary Floating-Point Arithmetic.2856-------------------------------------------------------------------------------2857*/2858float64 float64_mul( float64 a, float64 b )2859{2860flag aSign, bSign, zSign;2861int16 aExp, bExp, zExp;2862bits64 aSig, bSig, zSig0, zSig1;28632864aSig = extractFloat64Frac( a );2865aExp = extractFloat64Exp( a );2866aSign = extractFloat64Sign( a );2867bSig = extractFloat64Frac( b );2868bExp = extractFloat64Exp( b );2869bSign = extractFloat64Sign( b );2870zSign = aSign ^ bSign;2871if ( aExp == 0x7FF ) {2872if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {2873return propagateFloat64NaN( a, b );2874}2875if ( ( bExp | bSig ) == 0 ) {2876float_raise( float_flag_invalid );2877return float64_default_nan;2878}2879return packFloat64( zSign, 0x7FF, 0 );2880}2881if ( bExp == 0x7FF ) {2882if ( bSig ) return propagateFloat64NaN( a, b );2883if ( ( aExp | aSig ) == 0 ) {2884float_raise( float_flag_invalid );2885return float64_default_nan;2886}2887return packFloat64( zSign, 0x7FF, 0 );2888}2889if ( aExp == 0 ) {2890if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );2891normalizeFloat64Subnormal( aSig, &aExp, &aSig );2892}2893if ( bExp == 0 ) {2894if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );2895normalizeFloat64Subnormal( bSig, &bExp, &bSig );2896}2897zExp = aExp + bExp - 0x3FF;2898aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;2899bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;2900mul64To128( aSig, bSig, &zSig0, &zSig1 );2901zSig0 |= ( zSig1 != 0 );2902if ( 0 <= (sbits64) ( zSig0<<1 ) ) {2903zSig0 <<= 1;2904--zExp;2905}2906return roundAndPackFloat64( zSign, zExp, zSig0 );29072908}29092910/*2911-------------------------------------------------------------------------------2912Returns the result of dividing the double-precision floating-point value `a'2913by the corresponding value `b'. The operation is performed according to2914the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2915-------------------------------------------------------------------------------2916*/2917float64 float64_div( float64 a, float64 b )2918{2919flag aSign, bSign, zSign;2920int16 aExp, bExp, zExp;2921bits64 aSig, bSig, zSig;2922bits64 rem0, rem1;2923bits64 term0, term1;29242925aSig = extractFloat64Frac( a );2926aExp = extractFloat64Exp( a );2927aSign = extractFloat64Sign( a );2928bSig = extractFloat64Frac( b );2929bExp = extractFloat64Exp( b );2930bSign = extractFloat64Sign( b );2931zSign = aSign ^ bSign;2932if ( aExp == 0x7FF ) {2933if ( aSig ) return propagateFloat64NaN( a, b );2934if ( bExp == 0x7FF ) {2935if ( bSig ) return propagateFloat64NaN( a, b );2936float_raise( float_flag_invalid );2937return float64_default_nan;2938}2939return packFloat64( zSign, 0x7FF, 0 );2940}2941if ( bExp == 0x7FF ) {2942if ( bSig ) return propagateFloat64NaN( a, b );2943return packFloat64( zSign, 0, 0 );2944}2945if ( bExp == 0 ) {2946if ( bSig == 0 ) {2947if ( ( aExp | aSig ) == 0 ) {2948float_raise( float_flag_invalid );2949return float64_default_nan;2950}2951float_raise( float_flag_divbyzero );2952return packFloat64( zSign, 0x7FF, 0 );2953}2954normalizeFloat64Subnormal( bSig, &bExp, &bSig );2955}2956if ( aExp == 0 ) {2957if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );2958normalizeFloat64Subnormal( aSig, &aExp, &aSig );2959}2960zExp = aExp - bExp + 0x3FD;2961aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;2962bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;2963if ( bSig <= ( aSig + aSig ) ) {2964aSig >>= 1;2965++zExp;2966}2967zSig = estimateDiv128To64( aSig, 0, bSig );2968if ( ( zSig & 0x1FF ) <= 2 ) {2969mul64To128( bSig, zSig, &term0, &term1 );2970sub128( aSig, 0, term0, term1, &rem0, &rem1 );2971while ( (sbits64) rem0 < 0 ) {2972--zSig;2973add128( rem0, rem1, 0, bSig, &rem0, &rem1 );2974}2975zSig |= ( rem1 != 0 );2976}2977return roundAndPackFloat64( zSign, zExp, zSig );29782979}29802981#ifndef SOFTFLOAT_FOR_GCC2982/*2983-------------------------------------------------------------------------------2984Returns the remainder of the double-precision floating-point value `a'2985with respect to the corresponding value `b'. The operation is performed2986according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2987-------------------------------------------------------------------------------2988*/2989float64 float64_rem( float64 a, float64 b )2990{2991flag aSign, bSign, zSign;2992int16 aExp, bExp, expDiff;2993bits64 aSig, bSig;2994bits64 q, alternateASig;2995sbits64 sigMean;29962997aSig = extractFloat64Frac( a );2998aExp = extractFloat64Exp( a );2999aSign = extractFloat64Sign( a );3000bSig = extractFloat64Frac( b );3001bExp = extractFloat64Exp( b );3002bSign = extractFloat64Sign( b );3003if ( aExp == 0x7FF ) {3004if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {3005return propagateFloat64NaN( a, b );3006}3007float_raise( float_flag_invalid );3008return float64_default_nan;3009}3010if ( bExp == 0x7FF ) {3011if ( bSig ) return propagateFloat64NaN( a, b );3012return a;3013}3014if ( bExp == 0 ) {3015if ( bSig == 0 ) {3016float_raise( float_flag_invalid );3017return float64_default_nan;3018}3019normalizeFloat64Subnormal( bSig, &bExp, &bSig );3020}3021if ( aExp == 0 ) {3022if ( aSig == 0 ) return a;3023normalizeFloat64Subnormal( aSig, &aExp, &aSig );3024}3025expDiff = aExp - bExp;3026aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;3027bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;3028if ( expDiff < 0 ) {3029if ( expDiff < -1 ) return a;3030aSig >>= 1;3031}3032q = ( bSig <= aSig );3033if ( q ) aSig -= bSig;3034expDiff -= 64;3035while ( 0 < expDiff ) {3036q = estimateDiv128To64( aSig, 0, bSig );3037q = ( 2 < q ) ? q - 2 : 0;3038aSig = - ( ( bSig>>2 ) * q );3039expDiff -= 62;3040}3041expDiff += 64;3042if ( 0 < expDiff ) {3043q = estimateDiv128To64( aSig, 0, bSig );3044q = ( 2 < q ) ? q - 2 : 0;3045q >>= 64 - expDiff;3046bSig >>= 2;3047aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;3048}3049else {3050aSig >>= 2;3051bSig >>= 2;3052}3053do {3054alternateASig = aSig;3055++q;3056aSig -= bSig;3057} while ( 0 <= (sbits64) aSig );3058sigMean = aSig + alternateASig;3059if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {3060aSig = alternateASig;3061}3062zSign = ( (sbits64) aSig < 0 );3063if ( zSign ) aSig = - aSig;3064return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );30653066}30673068/*3069-------------------------------------------------------------------------------3070Returns the square root of the double-precision floating-point value `a'.3071The operation is performed according to the IEC/IEEE Standard for Binary3072Floating-Point Arithmetic.3073-------------------------------------------------------------------------------3074*/3075float64 float64_sqrt( float64 a )3076{3077flag aSign;3078int16 aExp, zExp;3079bits64 aSig, zSig, doubleZSig;3080bits64 rem0, rem1, term0, term1;30813082aSig = extractFloat64Frac( a );3083aExp = extractFloat64Exp( a );3084aSign = extractFloat64Sign( a );3085if ( aExp == 0x7FF ) {3086if ( aSig ) return propagateFloat64NaN( a, a );3087if ( ! aSign ) return a;3088float_raise( float_flag_invalid );3089return float64_default_nan;3090}3091if ( aSign ) {3092if ( ( aExp | aSig ) == 0 ) return a;3093float_raise( float_flag_invalid );3094return float64_default_nan;3095}3096if ( aExp == 0 ) {3097if ( aSig == 0 ) return 0;3098normalizeFloat64Subnormal( aSig, &aExp, &aSig );3099}3100zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;3101aSig |= LIT64( 0x0010000000000000 );3102zSig = estimateSqrt32( aExp, aSig>>21 );3103aSig <<= 9 - ( aExp & 1 );3104zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );3105if ( ( zSig & 0x1FF ) <= 5 ) {3106doubleZSig = zSig<<1;3107mul64To128( zSig, zSig, &term0, &term1 );3108sub128( aSig, 0, term0, term1, &rem0, &rem1 );3109while ( (sbits64) rem0 < 0 ) {3110--zSig;3111doubleZSig -= 2;3112add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );3113}3114zSig |= ( ( rem0 | rem1 ) != 0 );3115}3116return roundAndPackFloat64( 0, zExp, zSig );31173118}3119#endif31203121/*3122-------------------------------------------------------------------------------3123Returns 1 if the double-precision floating-point value `a' is equal to the3124corresponding value `b', and 0 otherwise. The comparison is performed3125according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.3126-------------------------------------------------------------------------------3127*/3128flag float64_eq( float64 a, float64 b )3129{31303131if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3132|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3133) {3134if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {3135float_raise( float_flag_invalid );3136}3137return 0;3138}3139return ( a == b ) ||3140( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );31413142}31433144/*3145-------------------------------------------------------------------------------3146Returns 1 if the double-precision floating-point value `a' is less than or3147equal to the corresponding value `b', and 0 otherwise. The comparison is3148performed according to the IEC/IEEE Standard for Binary Floating-Point3149Arithmetic.3150-------------------------------------------------------------------------------3151*/3152flag float64_le( float64 a, float64 b )3153{3154flag aSign, bSign;31553156if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3157|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3158) {3159float_raise( float_flag_invalid );3160return 0;3161}3162aSign = extractFloat64Sign( a );3163bSign = extractFloat64Sign( b );3164if ( aSign != bSign )3165return aSign ||3166( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==31670 );3168return ( a == b ) ||3169( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );31703171}31723173/*3174-------------------------------------------------------------------------------3175Returns 1 if the double-precision floating-point value `a' is less than3176the corresponding value `b', and 0 otherwise. The comparison is performed3177according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.3178-------------------------------------------------------------------------------3179*/3180flag float64_lt( float64 a, float64 b )3181{3182flag aSign, bSign;31833184if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3185|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3186) {3187float_raise( float_flag_invalid );3188return 0;3189}3190aSign = extractFloat64Sign( a );3191bSign = extractFloat64Sign( b );3192if ( aSign != bSign )3193return aSign &&3194( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=31950 );3196return ( a != b ) &&3197( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );31983199}32003201#ifndef SOFTFLOAT_FOR_GCC3202/*3203-------------------------------------------------------------------------------3204Returns 1 if the double-precision floating-point value `a' is equal to the3205corresponding value `b', and 0 otherwise. The invalid exception is raised3206if either operand is a NaN. Otherwise, the comparison is performed3207according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.3208-------------------------------------------------------------------------------3209*/3210flag float64_eq_signaling( float64 a, float64 b )3211{32123213if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3214|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3215) {3216float_raise( float_flag_invalid );3217return 0;3218}3219return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );32203221}32223223/*3224-------------------------------------------------------------------------------3225Returns 1 if the double-precision floating-point value `a' is less than or3226equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not3227cause an exception. Otherwise, the comparison is performed according to the3228IEC/IEEE Standard for Binary Floating-Point Arithmetic.3229-------------------------------------------------------------------------------3230*/3231flag float64_le_quiet( float64 a, float64 b )3232{3233flag aSign, bSign;32343235if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3236|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3237) {3238if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {3239float_raise( float_flag_invalid );3240}3241return 0;3242}3243aSign = extractFloat64Sign( a );3244bSign = extractFloat64Sign( b );3245if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );3246return ( a == b ) || ( aSign ^ ( a < b ) );32473248}32493250/*3251-------------------------------------------------------------------------------3252Returns 1 if the double-precision floating-point value `a' is less than3253the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an3254exception. Otherwise, the comparison is performed according to the IEC/IEEE3255Standard for Binary Floating-Point Arithmetic.3256-------------------------------------------------------------------------------3257*/3258flag float64_lt_quiet( float64 a, float64 b )3259{3260flag aSign, bSign;32613262if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )3263|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )3264) {3265if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {3266float_raise( float_flag_invalid );3267}3268return 0;3269}3270aSign = extractFloat64Sign( a );3271bSign = extractFloat64Sign( b );3272if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );3273return ( a != b ) && ( aSign ^ ( a < b ) );32743275}3276#endif32773278#ifdef FLOATX8032793280/*3281-------------------------------------------------------------------------------3282Returns the result of converting the extended double-precision floating-3283point value `a' to the 32-bit two's complement integer format. The3284conversion is performed according to the IEC/IEEE Standard for Binary3285Floating-Point Arithmetic---which means in particular that the conversion3286is rounded according to the current rounding mode. If `a' is a NaN, the3287largest positive integer is returned. Otherwise, if the conversion3288overflows, the largest integer with the same sign as `a' is returned.3289-------------------------------------------------------------------------------3290*/3291int32 floatx80_to_int32( floatx80 a )3292{3293flag aSign;3294int32 aExp, shiftCount;3295bits64 aSig;32963297aSig = extractFloatx80Frac( a );3298aExp = extractFloatx80Exp( a );3299aSign = extractFloatx80Sign( a );3300if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;3301shiftCount = 0x4037 - aExp;3302if ( shiftCount <= 0 ) shiftCount = 1;3303shift64RightJamming( aSig, shiftCount, &aSig );3304return roundAndPackInt32( aSign, aSig );33053306}33073308/*3309-------------------------------------------------------------------------------3310Returns the result of converting the extended double-precision floating-3311point value `a' to the 32-bit two's complement integer format. The3312conversion is performed according to the IEC/IEEE Standard for Binary3313Floating-Point Arithmetic, except that the conversion is always rounded3314toward zero. If `a' is a NaN, the largest positive integer is returned.3315Otherwise, if the conversion overflows, the largest integer with the same3316sign as `a' is returned.3317-------------------------------------------------------------------------------3318*/3319int32 floatx80_to_int32_round_to_zero( floatx80 a )3320{3321flag aSign;3322int32 aExp, shiftCount;3323bits64 aSig, savedASig;3324int32 z;33253326aSig = extractFloatx80Frac( a );3327aExp = extractFloatx80Exp( a );3328aSign = extractFloatx80Sign( a );3329if ( 0x401E < aExp ) {3330if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;3331goto invalid;3332}3333else if ( aExp < 0x3FFF ) {3334if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;3335return 0;3336}3337shiftCount = 0x403E - aExp;3338savedASig = aSig;3339aSig >>= shiftCount;3340z = aSig;3341if ( aSign ) z = - z;3342if ( ( z < 0 ) ^ aSign ) {3343invalid:3344float_raise( float_flag_invalid );3345return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;3346}3347if ( ( aSig<<shiftCount ) != savedASig ) {3348float_exception_flags |= float_flag_inexact;3349}3350return z;33513352}33533354/*3355-------------------------------------------------------------------------------3356Returns the result of converting the extended double-precision floating-3357point value `a' to the 64-bit two's complement integer format. The3358conversion is performed according to the IEC/IEEE Standard for Binary3359Floating-Point Arithmetic---which means in particular that the conversion3360is rounded according to the current rounding mode. If `a' is a NaN,3361the largest positive integer is returned. Otherwise, if the conversion3362overflows, the largest integer with the same sign as `a' is returned.3363-------------------------------------------------------------------------------3364*/3365int64 floatx80_to_int64( floatx80 a )3366{3367flag aSign;3368int32 aExp, shiftCount;3369bits64 aSig, aSigExtra;33703371aSig = extractFloatx80Frac( a );3372aExp = extractFloatx80Exp( a );3373aSign = extractFloatx80Sign( a );3374shiftCount = 0x403E - aExp;3375if ( shiftCount <= 0 ) {3376if ( shiftCount ) {3377float_raise( float_flag_invalid );3378if ( ! aSign3379|| ( ( aExp == 0x7FFF )3380&& ( aSig != LIT64( 0x8000000000000000 ) ) )3381) {3382return LIT64( 0x7FFFFFFFFFFFFFFF );3383}3384return (sbits64) LIT64( 0x8000000000000000 );3385}3386aSigExtra = 0;3387}3388else {3389shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );3390}3391return roundAndPackInt64( aSign, aSig, aSigExtra );33923393}33943395/*3396-------------------------------------------------------------------------------3397Returns the result of converting the extended double-precision floating-3398point value `a' to the 64-bit two's complement integer format. The3399conversion is performed according to the IEC/IEEE Standard for Binary3400Floating-Point Arithmetic, except that the conversion is always rounded3401toward zero. If `a' is a NaN, the largest positive integer is returned.3402Otherwise, if the conversion overflows, the largest integer with the same3403sign as `a' is returned.3404-------------------------------------------------------------------------------3405*/3406int64 floatx80_to_int64_round_to_zero( floatx80 a )3407{3408flag aSign;3409int32 aExp, shiftCount;3410bits64 aSig;3411int64 z;34123413aSig = extractFloatx80Frac( a );3414aExp = extractFloatx80Exp( a );3415aSign = extractFloatx80Sign( a );3416shiftCount = aExp - 0x403E;3417if ( 0 <= shiftCount ) {3418aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );3419if ( ( a.high != 0xC03E ) || aSig ) {3420float_raise( float_flag_invalid );3421if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {3422return LIT64( 0x7FFFFFFFFFFFFFFF );3423}3424}3425return (sbits64) LIT64( 0x8000000000000000 );3426}3427else if ( aExp < 0x3FFF ) {3428if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;3429return 0;3430}3431z = aSig>>( - shiftCount );3432if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {3433float_exception_flags |= float_flag_inexact;3434}3435if ( aSign ) z = - z;3436return z;34373438}34393440/*3441-------------------------------------------------------------------------------3442Returns the result of converting the extended double-precision floating-3443point value `a' to the single-precision floating-point format. The3444conversion is performed according to the IEC/IEEE Standard for Binary3445Floating-Point Arithmetic.3446-------------------------------------------------------------------------------3447*/3448float32 floatx80_to_float32( floatx80 a )3449{3450flag aSign;3451int32 aExp;3452bits64 aSig;34533454aSig = extractFloatx80Frac( a );3455aExp = extractFloatx80Exp( a );3456aSign = extractFloatx80Sign( a );3457if ( aExp == 0x7FFF ) {3458if ( (bits64) ( aSig<<1 ) ) {3459return commonNaNToFloat32( floatx80ToCommonNaN( a ) );3460}3461return packFloat32( aSign, 0xFF, 0 );3462}3463shift64RightJamming( aSig, 33, &aSig );3464if ( aExp || aSig ) aExp -= 0x3F81;3465return roundAndPackFloat32( aSign, aExp, aSig );34663467}34683469/*3470-------------------------------------------------------------------------------3471Returns the result of converting the extended double-precision floating-3472point value `a' to the double-precision floating-point format. The3473conversion is performed according to the IEC/IEEE Standard for Binary3474Floating-Point Arithmetic.3475-------------------------------------------------------------------------------3476*/3477float64 floatx80_to_float64( floatx80 a )3478{3479flag aSign;3480int32 aExp;3481bits64 aSig, zSig;34823483aSig = extractFloatx80Frac( a );3484aExp = extractFloatx80Exp( a );3485aSign = extractFloatx80Sign( a );3486if ( aExp == 0x7FFF ) {3487if ( (bits64) ( aSig<<1 ) ) {3488return commonNaNToFloat64( floatx80ToCommonNaN( a ) );3489}3490return packFloat64( aSign, 0x7FF, 0 );3491}3492shift64RightJamming( aSig, 1, &zSig );3493if ( aExp || aSig ) aExp -= 0x3C01;3494return roundAndPackFloat64( aSign, aExp, zSig );34953496}34973498#ifdef FLOAT12834993500/*3501-------------------------------------------------------------------------------3502Returns the result of converting the extended double-precision floating-3503point value `a' to the quadruple-precision floating-point format. The3504conversion is performed according to the IEC/IEEE Standard for Binary3505Floating-Point Arithmetic.3506-------------------------------------------------------------------------------3507*/3508float128 floatx80_to_float128( floatx80 a )3509{3510flag aSign;3511int16 aExp;3512bits64 aSig, zSig0, zSig1;35133514aSig = extractFloatx80Frac( a );3515aExp = extractFloatx80Exp( a );3516aSign = extractFloatx80Sign( a );3517if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {3518return commonNaNToFloat128( floatx80ToCommonNaN( a ) );3519}3520shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );3521return packFloat128( aSign, aExp, zSig0, zSig1 );35223523}35243525#endif35263527/*3528-------------------------------------------------------------------------------3529Rounds the extended double-precision floating-point value `a' to an integer,3530and returns the result as an extended quadruple-precision floating-point3531value. The operation is performed according to the IEC/IEEE Standard for3532Binary Floating-Point Arithmetic.3533-------------------------------------------------------------------------------3534*/3535floatx80 floatx80_round_to_int( floatx80 a )3536{3537flag aSign;3538int32 aExp;3539bits64 lastBitMask, roundBitsMask;3540int8 roundingMode;3541floatx80 z;35423543aExp = extractFloatx80Exp( a );3544if ( 0x403E <= aExp ) {3545if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {3546return propagateFloatx80NaN( a, a );3547}3548return a;3549}3550if ( aExp < 0x3FFF ) {3551if ( ( aExp == 0 )3552&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {3553return a;3554}3555float_exception_flags |= float_flag_inexact;3556aSign = extractFloatx80Sign( a );3557switch ( float_rounding_mode ) {3558case float_round_nearest_even:3559if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )3560) {3561return3562packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );3563}3564break;3565case float_round_to_zero:3566break;3567case float_round_down:3568return3569aSign ?3570packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )3571: packFloatx80( 0, 0, 0 );3572case float_round_up:3573return3574aSign ? packFloatx80( 1, 0, 0 )3575: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );3576}3577return packFloatx80( aSign, 0, 0 );3578}3579lastBitMask = 1;3580lastBitMask <<= 0x403E - aExp;3581roundBitsMask = lastBitMask - 1;3582z = a;3583roundingMode = float_rounding_mode;3584if ( roundingMode == float_round_nearest_even ) {3585z.low += lastBitMask>>1;3586if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;3587}3588else if ( roundingMode != float_round_to_zero ) {3589if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {3590z.low += roundBitsMask;3591}3592}3593z.low &= ~ roundBitsMask;3594if ( z.low == 0 ) {3595++z.high;3596z.low = LIT64( 0x8000000000000000 );3597}3598if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;3599return z;36003601}36023603/*3604-------------------------------------------------------------------------------3605Returns the result of adding the absolute values of the extended double-3606precision floating-point values `a' and `b'. If `zSign' is 1, the sum is3607negated before being returned. `zSign' is ignored if the result is a NaN.3608The addition is performed according to the IEC/IEEE Standard for Binary3609Floating-Point Arithmetic.3610-------------------------------------------------------------------------------3611*/3612static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )3613{3614int32 aExp, bExp, zExp;3615bits64 aSig, bSig, zSig0, zSig1;3616int32 expDiff;36173618aSig = extractFloatx80Frac( a );3619aExp = extractFloatx80Exp( a );3620bSig = extractFloatx80Frac( b );3621bExp = extractFloatx80Exp( b );3622expDiff = aExp - bExp;3623if ( 0 < expDiff ) {3624if ( aExp == 0x7FFF ) {3625if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );3626return a;3627}3628if ( bExp == 0 ) --expDiff;3629shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );3630zExp = aExp;3631}3632else if ( expDiff < 0 ) {3633if ( bExp == 0x7FFF ) {3634if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3635return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );3636}3637if ( aExp == 0 ) ++expDiff;3638shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );3639zExp = bExp;3640}3641else {3642if ( aExp == 0x7FFF ) {3643if ( (bits64) ( ( aSig | bSig )<<1 ) ) {3644return propagateFloatx80NaN( a, b );3645}3646return a;3647}3648zSig1 = 0;3649zSig0 = aSig + bSig;3650if ( aExp == 0 ) {3651normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );3652goto roundAndPack;3653}3654zExp = aExp;3655goto shiftRight1;3656}3657zSig0 = aSig + bSig;3658if ( (sbits64) zSig0 < 0 ) goto roundAndPack;3659shiftRight1:3660shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );3661zSig0 |= LIT64( 0x8000000000000000 );3662++zExp;3663roundAndPack:3664return3665roundAndPackFloatx80(3666floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );36673668}36693670/*3671-------------------------------------------------------------------------------3672Returns the result of subtracting the absolute values of the extended3673double-precision floating-point values `a' and `b'. If `zSign' is 1, the3674difference is negated before being returned. `zSign' is ignored if the3675result is a NaN. The subtraction is performed according to the IEC/IEEE3676Standard for Binary Floating-Point Arithmetic.3677-------------------------------------------------------------------------------3678*/3679static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )3680{3681int32 aExp, bExp, zExp;3682bits64 aSig, bSig, zSig0, zSig1;3683int32 expDiff;3684floatx80 z;36853686aSig = extractFloatx80Frac( a );3687aExp = extractFloatx80Exp( a );3688bSig = extractFloatx80Frac( b );3689bExp = extractFloatx80Exp( b );3690expDiff = aExp - bExp;3691if ( 0 < expDiff ) goto aExpBigger;3692if ( expDiff < 0 ) goto bExpBigger;3693if ( aExp == 0x7FFF ) {3694if ( (bits64) ( ( aSig | bSig )<<1 ) ) {3695return propagateFloatx80NaN( a, b );3696}3697float_raise( float_flag_invalid );3698z.low = floatx80_default_nan_low;3699z.high = floatx80_default_nan_high;3700return z;3701}3702if ( aExp == 0 ) {3703aExp = 1;3704bExp = 1;3705}3706zSig1 = 0;3707if ( bSig < aSig ) goto aBigger;3708if ( aSig < bSig ) goto bBigger;3709return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );3710bExpBigger:3711if ( bExp == 0x7FFF ) {3712if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3713return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );3714}3715if ( aExp == 0 ) ++expDiff;3716shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );3717bBigger:3718sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );3719zExp = bExp;3720zSign ^= 1;3721goto normalizeRoundAndPack;3722aExpBigger:3723if ( aExp == 0x7FFF ) {3724if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );3725return a;3726}3727if ( bExp == 0 ) --expDiff;3728shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );3729aBigger:3730sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );3731zExp = aExp;3732normalizeRoundAndPack:3733return3734normalizeRoundAndPackFloatx80(3735floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );37363737}37383739/*3740-------------------------------------------------------------------------------3741Returns the result of adding the extended double-precision floating-point3742values `a' and `b'. The operation is performed according to the IEC/IEEE3743Standard for Binary Floating-Point Arithmetic.3744-------------------------------------------------------------------------------3745*/3746floatx80 floatx80_add( floatx80 a, floatx80 b )3747{3748flag aSign, bSign;37493750aSign = extractFloatx80Sign( a );3751bSign = extractFloatx80Sign( b );3752if ( aSign == bSign ) {3753return addFloatx80Sigs( a, b, aSign );3754}3755else {3756return subFloatx80Sigs( a, b, aSign );3757}37583759}37603761/*3762-------------------------------------------------------------------------------3763Returns the result of subtracting the extended double-precision floating-3764point values `a' and `b'. The operation is performed according to the3765IEC/IEEE Standard for Binary Floating-Point Arithmetic.3766-------------------------------------------------------------------------------3767*/3768floatx80 floatx80_sub( floatx80 a, floatx80 b )3769{3770flag aSign, bSign;37713772aSign = extractFloatx80Sign( a );3773bSign = extractFloatx80Sign( b );3774if ( aSign == bSign ) {3775return subFloatx80Sigs( a, b, aSign );3776}3777else {3778return addFloatx80Sigs( a, b, aSign );3779}37803781}37823783/*3784-------------------------------------------------------------------------------3785Returns the result of multiplying the extended double-precision floating-3786point values `a' and `b'. The operation is performed according to the3787IEC/IEEE Standard for Binary Floating-Point Arithmetic.3788-------------------------------------------------------------------------------3789*/3790floatx80 floatx80_mul( floatx80 a, floatx80 b )3791{3792flag aSign, bSign, zSign;3793int32 aExp, bExp, zExp;3794bits64 aSig, bSig, zSig0, zSig1;3795floatx80 z;37963797aSig = extractFloatx80Frac( a );3798aExp = extractFloatx80Exp( a );3799aSign = extractFloatx80Sign( a );3800bSig = extractFloatx80Frac( b );3801bExp = extractFloatx80Exp( b );3802bSign = extractFloatx80Sign( b );3803zSign = aSign ^ bSign;3804if ( aExp == 0x7FFF ) {3805if ( (bits64) ( aSig<<1 )3806|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {3807return propagateFloatx80NaN( a, b );3808}3809if ( ( bExp | bSig ) == 0 ) goto invalid;3810return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );3811}3812if ( bExp == 0x7FFF ) {3813if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3814if ( ( aExp | aSig ) == 0 ) {3815invalid:3816float_raise( float_flag_invalid );3817z.low = floatx80_default_nan_low;3818z.high = floatx80_default_nan_high;3819return z;3820}3821return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );3822}3823if ( aExp == 0 ) {3824if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );3825normalizeFloatx80Subnormal( aSig, &aExp, &aSig );3826}3827if ( bExp == 0 ) {3828if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );3829normalizeFloatx80Subnormal( bSig, &bExp, &bSig );3830}3831zExp = aExp + bExp - 0x3FFE;3832mul64To128( aSig, bSig, &zSig0, &zSig1 );3833if ( 0 < (sbits64) zSig0 ) {3834shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );3835--zExp;3836}3837return3838roundAndPackFloatx80(3839floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );38403841}38423843/*3844-------------------------------------------------------------------------------3845Returns the result of dividing the extended double-precision floating-point3846value `a' by the corresponding value `b'. The operation is performed3847according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.3848-------------------------------------------------------------------------------3849*/3850floatx80 floatx80_div( floatx80 a, floatx80 b )3851{3852flag aSign, bSign, zSign;3853int32 aExp, bExp, zExp;3854bits64 aSig, bSig, zSig0, zSig1;3855bits64 rem0, rem1, rem2, term0, term1, term2;3856floatx80 z;38573858aSig = extractFloatx80Frac( a );3859aExp = extractFloatx80Exp( a );3860aSign = extractFloatx80Sign( a );3861bSig = extractFloatx80Frac( b );3862bExp = extractFloatx80Exp( b );3863bSign = extractFloatx80Sign( b );3864zSign = aSign ^ bSign;3865if ( aExp == 0x7FFF ) {3866if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );3867if ( bExp == 0x7FFF ) {3868if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3869goto invalid;3870}3871return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );3872}3873if ( bExp == 0x7FFF ) {3874if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3875return packFloatx80( zSign, 0, 0 );3876}3877if ( bExp == 0 ) {3878if ( bSig == 0 ) {3879if ( ( aExp | aSig ) == 0 ) {3880invalid:3881float_raise( float_flag_invalid );3882z.low = floatx80_default_nan_low;3883z.high = floatx80_default_nan_high;3884return z;3885}3886float_raise( float_flag_divbyzero );3887return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );3888}3889normalizeFloatx80Subnormal( bSig, &bExp, &bSig );3890}3891if ( aExp == 0 ) {3892if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );3893normalizeFloatx80Subnormal( aSig, &aExp, &aSig );3894}3895zExp = aExp - bExp + 0x3FFE;3896rem1 = 0;3897if ( bSig <= aSig ) {3898shift128Right( aSig, 0, 1, &aSig, &rem1 );3899++zExp;3900}3901zSig0 = estimateDiv128To64( aSig, rem1, bSig );3902mul64To128( bSig, zSig0, &term0, &term1 );3903sub128( aSig, rem1, term0, term1, &rem0, &rem1 );3904while ( (sbits64) rem0 < 0 ) {3905--zSig0;3906add128( rem0, rem1, 0, bSig, &rem0, &rem1 );3907}3908zSig1 = estimateDiv128To64( rem1, 0, bSig );3909if ( (bits64) ( zSig1<<1 ) <= 8 ) {3910mul64To128( bSig, zSig1, &term1, &term2 );3911sub128( rem1, 0, term1, term2, &rem1, &rem2 );3912while ( (sbits64) rem1 < 0 ) {3913--zSig1;3914add128( rem1, rem2, 0, bSig, &rem1, &rem2 );3915}3916zSig1 |= ( ( rem1 | rem2 ) != 0 );3917}3918return3919roundAndPackFloatx80(3920floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );39213922}39233924/*3925-------------------------------------------------------------------------------3926Returns the remainder of the extended double-precision floating-point value3927`a' with respect to the corresponding value `b'. The operation is performed3928according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.3929-------------------------------------------------------------------------------3930*/3931floatx80 floatx80_rem( floatx80 a, floatx80 b )3932{3933flag aSign, bSign, zSign;3934int32 aExp, bExp, expDiff;3935bits64 aSig0, aSig1, bSig;3936bits64 q, term0, term1, alternateASig0, alternateASig1;3937floatx80 z;39383939aSig0 = extractFloatx80Frac( a );3940aExp = extractFloatx80Exp( a );3941aSign = extractFloatx80Sign( a );3942bSig = extractFloatx80Frac( b );3943bExp = extractFloatx80Exp( b );3944bSign = extractFloatx80Sign( b );3945if ( aExp == 0x7FFF ) {3946if ( (bits64) ( aSig0<<1 )3947|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {3948return propagateFloatx80NaN( a, b );3949}3950goto invalid;3951}3952if ( bExp == 0x7FFF ) {3953if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );3954return a;3955}3956if ( bExp == 0 ) {3957if ( bSig == 0 ) {3958invalid:3959float_raise( float_flag_invalid );3960z.low = floatx80_default_nan_low;3961z.high = floatx80_default_nan_high;3962return z;3963}3964normalizeFloatx80Subnormal( bSig, &bExp, &bSig );3965}3966if ( aExp == 0 ) {3967if ( (bits64) ( aSig0<<1 ) == 0 ) return a;3968normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );3969}3970bSig |= LIT64( 0x8000000000000000 );3971zSign = aSign;3972expDiff = aExp - bExp;3973aSig1 = 0;3974if ( expDiff < 0 ) {3975if ( expDiff < -1 ) return a;3976shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );3977expDiff = 0;3978}3979q = ( bSig <= aSig0 );3980if ( q ) aSig0 -= bSig;3981expDiff -= 64;3982while ( 0 < expDiff ) {3983q = estimateDiv128To64( aSig0, aSig1, bSig );3984q = ( 2 < q ) ? q - 2 : 0;3985mul64To128( bSig, q, &term0, &term1 );3986sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );3987shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );3988expDiff -= 62;3989}3990expDiff += 64;3991if ( 0 < expDiff ) {3992q = estimateDiv128To64( aSig0, aSig1, bSig );3993q = ( 2 < q ) ? q - 2 : 0;3994q >>= 64 - expDiff;3995mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );3996sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );3997shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );3998while ( le128( term0, term1, aSig0, aSig1 ) ) {3999++q;4000sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );4001}4002}4003else {4004term1 = 0;4005term0 = bSig;4006}4007sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );4008if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )4009|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )4010&& ( q & 1 ) )4011) {4012aSig0 = alternateASig0;4013aSig1 = alternateASig1;4014zSign = ! zSign;4015}4016return4017normalizeRoundAndPackFloatx80(401880, zSign, bExp + expDiff, aSig0, aSig1 );40194020}40214022/*4023-------------------------------------------------------------------------------4024Returns the square root of the extended double-precision floating-point4025value `a'. The operation is performed according to the IEC/IEEE Standard4026for Binary Floating-Point Arithmetic.4027-------------------------------------------------------------------------------4028*/4029floatx80 floatx80_sqrt( floatx80 a )4030{4031flag aSign;4032int32 aExp, zExp;4033bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;4034bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;4035floatx80 z;40364037aSig0 = extractFloatx80Frac( a );4038aExp = extractFloatx80Exp( a );4039aSign = extractFloatx80Sign( a );4040if ( aExp == 0x7FFF ) {4041if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );4042if ( ! aSign ) return a;4043goto invalid;4044}4045if ( aSign ) {4046if ( ( aExp | aSig0 ) == 0 ) return a;4047invalid:4048float_raise( float_flag_invalid );4049z.low = floatx80_default_nan_low;4050z.high = floatx80_default_nan_high;4051return z;4052}4053if ( aExp == 0 ) {4054if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );4055normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );4056}4057zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;4058zSig0 = estimateSqrt32( aExp, aSig0>>32 );4059shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );4060zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );4061doubleZSig0 = zSig0<<1;4062mul64To128( zSig0, zSig0, &term0, &term1 );4063sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );4064while ( (sbits64) rem0 < 0 ) {4065--zSig0;4066doubleZSig0 -= 2;4067add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );4068}4069zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );4070if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {4071if ( zSig1 == 0 ) zSig1 = 1;4072mul64To128( doubleZSig0, zSig1, &term1, &term2 );4073sub128( rem1, 0, term1, term2, &rem1, &rem2 );4074mul64To128( zSig1, zSig1, &term2, &term3 );4075sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );4076while ( (sbits64) rem1 < 0 ) {4077--zSig1;4078shortShift128Left( 0, zSig1, 1, &term2, &term3 );4079term3 |= 1;4080term2 |= doubleZSig0;4081add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );4082}4083zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );4084}4085shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );4086zSig0 |= doubleZSig0;4087return4088roundAndPackFloatx80(4089floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );40904091}40924093/*4094-------------------------------------------------------------------------------4095Returns 1 if the extended double-precision floating-point value `a' is4096equal to the corresponding value `b', and 0 otherwise. The comparison is4097performed according to the IEC/IEEE Standard for Binary Floating-Point4098Arithmetic.4099-------------------------------------------------------------------------------4100*/4101flag floatx80_eq( floatx80 a, floatx80 b )4102{41034104if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4105&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4106|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4107&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4108) {4109if ( floatx80_is_signaling_nan( a )4110|| floatx80_is_signaling_nan( b ) ) {4111float_raise( float_flag_invalid );4112}4113return 0;4114}4115return4116( a.low == b.low )4117&& ( ( a.high == b.high )4118|| ( ( a.low == 0 )4119&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )4120);41214122}41234124/*4125-------------------------------------------------------------------------------4126Returns 1 if the extended double-precision floating-point value `a' is4127less than or equal to the corresponding value `b', and 0 otherwise. The4128comparison is performed according to the IEC/IEEE Standard for Binary4129Floating-Point Arithmetic.4130-------------------------------------------------------------------------------4131*/4132flag floatx80_le( floatx80 a, floatx80 b )4133{4134flag aSign, bSign;41354136if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4137&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4138|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4139&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4140) {4141float_raise( float_flag_invalid );4142return 0;4143}4144aSign = extractFloatx80Sign( a );4145bSign = extractFloatx80Sign( b );4146if ( aSign != bSign ) {4147return4148aSign4149|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )4150== 0 );4151}4152return4153aSign ? le128( b.high, b.low, a.high, a.low )4154: le128( a.high, a.low, b.high, b.low );41554156}41574158/*4159-------------------------------------------------------------------------------4160Returns 1 if the extended double-precision floating-point value `a' is4161less than the corresponding value `b', and 0 otherwise. The comparison4162is performed according to the IEC/IEEE Standard for Binary Floating-Point4163Arithmetic.4164-------------------------------------------------------------------------------4165*/4166flag floatx80_lt( floatx80 a, floatx80 b )4167{4168flag aSign, bSign;41694170if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4171&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4172|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4173&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4174) {4175float_raise( float_flag_invalid );4176return 0;4177}4178aSign = extractFloatx80Sign( a );4179bSign = extractFloatx80Sign( b );4180if ( aSign != bSign ) {4181return4182aSign4183&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )4184!= 0 );4185}4186return4187aSign ? lt128( b.high, b.low, a.high, a.low )4188: lt128( a.high, a.low, b.high, b.low );41894190}41914192/*4193-------------------------------------------------------------------------------4194Returns 1 if the extended double-precision floating-point value `a' is equal4195to the corresponding value `b', and 0 otherwise. The invalid exception is4196raised if either operand is a NaN. Otherwise, the comparison is performed4197according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.4198-------------------------------------------------------------------------------4199*/4200flag floatx80_eq_signaling( floatx80 a, floatx80 b )4201{42024203if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4204&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4205|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4206&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4207) {4208float_raise( float_flag_invalid );4209return 0;4210}4211return4212( a.low == b.low )4213&& ( ( a.high == b.high )4214|| ( ( a.low == 0 )4215&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )4216);42174218}42194220/*4221-------------------------------------------------------------------------------4222Returns 1 if the extended double-precision floating-point value `a' is less4223than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs4224do not cause an exception. Otherwise, the comparison is performed according4225to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.4226-------------------------------------------------------------------------------4227*/4228flag floatx80_le_quiet( floatx80 a, floatx80 b )4229{4230flag aSign, bSign;42314232if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4233&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4234|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4235&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4236) {4237if ( floatx80_is_signaling_nan( a )4238|| floatx80_is_signaling_nan( b ) ) {4239float_raise( float_flag_invalid );4240}4241return 0;4242}4243aSign = extractFloatx80Sign( a );4244bSign = extractFloatx80Sign( b );4245if ( aSign != bSign ) {4246return4247aSign4248|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )4249== 0 );4250}4251return4252aSign ? le128( b.high, b.low, a.high, a.low )4253: le128( a.high, a.low, b.high, b.low );42544255}42564257/*4258-------------------------------------------------------------------------------4259Returns 1 if the extended double-precision floating-point value `a' is less4260than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause4261an exception. Otherwise, the comparison is performed according to the4262IEC/IEEE Standard for Binary Floating-Point Arithmetic.4263-------------------------------------------------------------------------------4264*/4265flag floatx80_lt_quiet( floatx80 a, floatx80 b )4266{4267flag aSign, bSign;42684269if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )4270&& (bits64) ( extractFloatx80Frac( a )<<1 ) )4271|| ( ( extractFloatx80Exp( b ) == 0x7FFF )4272&& (bits64) ( extractFloatx80Frac( b )<<1 ) )4273) {4274if ( floatx80_is_signaling_nan( a )4275|| floatx80_is_signaling_nan( b ) ) {4276float_raise( float_flag_invalid );4277}4278return 0;4279}4280aSign = extractFloatx80Sign( a );4281bSign = extractFloatx80Sign( b );4282if ( aSign != bSign ) {4283return4284aSign4285&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )4286!= 0 );4287}4288return4289aSign ? lt128( b.high, b.low, a.high, a.low )4290: lt128( a.high, a.low, b.high, b.low );42914292}42934294#endif42954296#ifdef FLOAT12842974298/*4299-------------------------------------------------------------------------------4300Returns the result of converting the quadruple-precision floating-point4301value `a' to the 32-bit two's complement integer format. The conversion4302is performed according to the IEC/IEEE Standard for Binary Floating-Point4303Arithmetic---which means in particular that the conversion is rounded4304according to the current rounding mode. If `a' is a NaN, the largest4305positive integer is returned. Otherwise, if the conversion overflows, the4306largest integer with the same sign as `a' is returned.4307-------------------------------------------------------------------------------4308*/4309int32 float128_to_int32( float128 a )4310{4311flag aSign;4312int32 aExp, shiftCount;4313bits64 aSig0, aSig1;43144315aSig1 = extractFloat128Frac1( a );4316aSig0 = extractFloat128Frac0( a );4317aExp = extractFloat128Exp( a );4318aSign = extractFloat128Sign( a );4319if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;4320if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );4321aSig0 |= ( aSig1 != 0 );4322shiftCount = 0x4028 - aExp;4323if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );4324return roundAndPackInt32( aSign, aSig0 );43254326}43274328/*4329-------------------------------------------------------------------------------4330Returns the result of converting the quadruple-precision floating-point4331value `a' to the 32-bit two's complement integer format. The conversion4332is performed according to the IEC/IEEE Standard for Binary Floating-Point4333Arithmetic, except that the conversion is always rounded toward zero. If4334`a' is a NaN, the largest positive integer is returned. Otherwise, if the4335conversion overflows, the largest integer with the same sign as `a' is4336returned.4337-------------------------------------------------------------------------------4338*/4339int32 float128_to_int32_round_to_zero( float128 a )4340{4341flag aSign;4342int32 aExp, shiftCount;4343bits64 aSig0, aSig1, savedASig;4344int32 z;43454346aSig1 = extractFloat128Frac1( a );4347aSig0 = extractFloat128Frac0( a );4348aExp = extractFloat128Exp( a );4349aSign = extractFloat128Sign( a );4350aSig0 |= ( aSig1 != 0 );4351if ( 0x401E < aExp ) {4352if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;4353goto invalid;4354}4355else if ( aExp < 0x3FFF ) {4356if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;4357return 0;4358}4359aSig0 |= LIT64( 0x0001000000000000 );4360shiftCount = 0x402F - aExp;4361savedASig = aSig0;4362aSig0 >>= shiftCount;4363z = aSig0;4364if ( aSign ) z = - z;4365if ( ( z < 0 ) ^ aSign ) {4366invalid:4367float_raise( float_flag_invalid );4368return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;4369}4370if ( ( aSig0<<shiftCount ) != savedASig ) {4371float_exception_flags |= float_flag_inexact;4372}4373return z;43744375}43764377/*4378-------------------------------------------------------------------------------4379Returns the result of converting the quadruple-precision floating-point4380value `a' to the 64-bit two's complement integer format. The conversion4381is performed according to the IEC/IEEE Standard for Binary Floating-Point4382Arithmetic---which means in particular that the conversion is rounded4383according to the current rounding mode. If `a' is a NaN, the largest4384positive integer is returned. Otherwise, if the conversion overflows, the4385largest integer with the same sign as `a' is returned.4386-------------------------------------------------------------------------------4387*/4388int64 float128_to_int64( float128 a )4389{4390flag aSign;4391int32 aExp, shiftCount;4392bits64 aSig0, aSig1;43934394aSig1 = extractFloat128Frac1( a );4395aSig0 = extractFloat128Frac0( a );4396aExp = extractFloat128Exp( a );4397aSign = extractFloat128Sign( a );4398if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );4399shiftCount = 0x402F - aExp;4400if ( shiftCount <= 0 ) {4401if ( 0x403E < aExp ) {4402float_raise( float_flag_invalid );4403if ( ! aSign4404|| ( ( aExp == 0x7FFF )4405&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )4406)4407) {4408return LIT64( 0x7FFFFFFFFFFFFFFF );4409}4410return (sbits64) LIT64( 0x8000000000000000 );4411}4412shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );4413}4414else {4415shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );4416}4417return roundAndPackInt64( aSign, aSig0, aSig1 );44184419}44204421/*4422-------------------------------------------------------------------------------4423Returns the result of converting the quadruple-precision floating-point4424value `a' to the 64-bit two's complement integer format. The conversion4425is performed according to the IEC/IEEE Standard for Binary Floating-Point4426Arithmetic, except that the conversion is always rounded toward zero.4427If `a' is a NaN, the largest positive integer is returned. Otherwise, if4428the conversion overflows, the largest integer with the same sign as `a' is4429returned.4430-------------------------------------------------------------------------------4431*/4432int64 float128_to_int64_round_to_zero( float128 a )4433{4434flag aSign;4435int32 aExp, shiftCount;4436bits64 aSig0, aSig1;4437int64 z;44384439aSig1 = extractFloat128Frac1( a );4440aSig0 = extractFloat128Frac0( a );4441aExp = extractFloat128Exp( a );4442aSign = extractFloat128Sign( a );4443if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );4444shiftCount = aExp - 0x402F;4445if ( 0 < shiftCount ) {4446if ( 0x403E <= aExp ) {4447aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );4448if ( ( a.high == LIT64( 0xC03E000000000000 ) )4449&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {4450if ( aSig1 ) float_exception_flags |= float_flag_inexact;4451}4452else {4453float_raise( float_flag_invalid );4454if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {4455return LIT64( 0x7FFFFFFFFFFFFFFF );4456}4457}4458return (sbits64) LIT64( 0x8000000000000000 );4459}4460z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );4461if ( (bits64) ( aSig1<<shiftCount ) ) {4462float_exception_flags |= float_flag_inexact;4463}4464}4465else {4466if ( aExp < 0x3FFF ) {4467if ( aExp | aSig0 | aSig1 ) {4468float_exception_flags |= float_flag_inexact;4469}4470return 0;4471}4472z = aSig0>>( - shiftCount );4473if ( aSig14474|| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {4475float_exception_flags |= float_flag_inexact;4476}4477}4478if ( aSign ) z = - z;4479return z;44804481}44824483#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \4484&& defined(SOFTFLOAT_NEED_FIXUNS)4485/*4486* just like above - but do not care for overflow of signed results4487*/4488uint64 float128_to_uint64_round_to_zero( float128 a )4489{4490flag aSign;4491int32 aExp, shiftCount;4492bits64 aSig0, aSig1;4493uint64 z;44944495aSig1 = extractFloat128Frac1( a );4496aSig0 = extractFloat128Frac0( a );4497aExp = extractFloat128Exp( a );4498aSign = extractFloat128Sign( a );4499if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );4500shiftCount = aExp - 0x402F;4501if ( 0 < shiftCount ) {4502if ( 0x403F <= aExp ) {4503aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );4504if ( ( a.high == LIT64( 0xC03E000000000000 ) )4505&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {4506if ( aSig1 ) float_exception_flags |= float_flag_inexact;4507}4508else {4509float_raise( float_flag_invalid );4510}4511return LIT64( 0xFFFFFFFFFFFFFFFF );4512}4513z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );4514if ( (bits64) ( aSig1<<shiftCount ) ) {4515float_exception_flags |= float_flag_inexact;4516}4517}4518else {4519if ( aExp < 0x3FFF ) {4520if ( aExp | aSig0 | aSig1 ) {4521float_exception_flags |= float_flag_inexact;4522}4523return 0;4524}4525z = aSig0>>( - shiftCount );4526if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {4527float_exception_flags |= float_flag_inexact;4528}4529}4530if ( aSign ) z = - z;4531return z;45324533}4534#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */45354536/*4537-------------------------------------------------------------------------------4538Returns the result of converting the quadruple-precision floating-point4539value `a' to the single-precision floating-point format. The conversion4540is performed according to the IEC/IEEE Standard for Binary Floating-Point4541Arithmetic.4542-------------------------------------------------------------------------------4543*/4544float32 float128_to_float32( float128 a )4545{4546flag aSign;4547int32 aExp;4548bits64 aSig0, aSig1;4549bits32 zSig;45504551aSig1 = extractFloat128Frac1( a );4552aSig0 = extractFloat128Frac0( a );4553aExp = extractFloat128Exp( a );4554aSign = extractFloat128Sign( a );4555if ( aExp == 0x7FFF ) {4556if ( aSig0 | aSig1 ) {4557return commonNaNToFloat32( float128ToCommonNaN( a ) );4558}4559return packFloat32( aSign, 0xFF, 0 );4560}4561aSig0 |= ( aSig1 != 0 );4562shift64RightJamming( aSig0, 18, &aSig0 );4563zSig = aSig0;4564if ( aExp || zSig ) {4565zSig |= 0x40000000;4566aExp -= 0x3F81;4567}4568return roundAndPackFloat32( aSign, aExp, zSig );45694570}45714572/*4573-------------------------------------------------------------------------------4574Returns the result of converting the quadruple-precision floating-point4575value `a' to the double-precision floating-point format. The conversion4576is performed according to the IEC/IEEE Standard for Binary Floating-Point4577Arithmetic.4578-------------------------------------------------------------------------------4579*/4580float64 float128_to_float64( float128 a )4581{4582flag aSign;4583int32 aExp;4584bits64 aSig0, aSig1;45854586aSig1 = extractFloat128Frac1( a );4587aSig0 = extractFloat128Frac0( a );4588aExp = extractFloat128Exp( a );4589aSign = extractFloat128Sign( a );4590if ( aExp == 0x7FFF ) {4591if ( aSig0 | aSig1 ) {4592return commonNaNToFloat64( float128ToCommonNaN( a ) );4593}4594return packFloat64( aSign, 0x7FF, 0 );4595}4596shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );4597aSig0 |= ( aSig1 != 0 );4598if ( aExp || aSig0 ) {4599aSig0 |= LIT64( 0x4000000000000000 );4600aExp -= 0x3C01;4601}4602return roundAndPackFloat64( aSign, aExp, aSig0 );46034604}46054606#ifdef FLOATX8046074608/*4609-------------------------------------------------------------------------------4610Returns the result of converting the quadruple-precision floating-point4611value `a' to the extended double-precision floating-point format. The4612conversion is performed according to the IEC/IEEE Standard for Binary4613Floating-Point Arithmetic.4614-------------------------------------------------------------------------------4615*/4616floatx80 float128_to_floatx80( float128 a )4617{4618flag aSign;4619int32 aExp;4620bits64 aSig0, aSig1;46214622aSig1 = extractFloat128Frac1( a );4623aSig0 = extractFloat128Frac0( a );4624aExp = extractFloat128Exp( a );4625aSign = extractFloat128Sign( a );4626if ( aExp == 0x7FFF ) {4627if ( aSig0 | aSig1 ) {4628return commonNaNToFloatx80( float128ToCommonNaN( a ) );4629}4630return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );4631}4632if ( aExp == 0 ) {4633if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );4634normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );4635}4636else {4637aSig0 |= LIT64( 0x0001000000000000 );4638}4639shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );4640return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );46414642}46434644#endif46454646/*4647-------------------------------------------------------------------------------4648Rounds the quadruple-precision floating-point value `a' to an integer, and4649returns the result as a quadruple-precision floating-point value. The4650operation is performed according to the IEC/IEEE Standard for Binary4651Floating-Point Arithmetic.4652-------------------------------------------------------------------------------4653*/4654float128 float128_round_to_int( float128 a )4655{4656flag aSign;4657int32 aExp;4658bits64 lastBitMask, roundBitsMask;4659int8 roundingMode;4660float128 z;46614662aExp = extractFloat128Exp( a );4663if ( 0x402F <= aExp ) {4664if ( 0x406F <= aExp ) {4665if ( ( aExp == 0x7FFF )4666&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )4667) {4668return propagateFloat128NaN( a, a );4669}4670return a;4671}4672lastBitMask = 1;4673lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;4674roundBitsMask = lastBitMask - 1;4675z = a;4676roundingMode = float_rounding_mode;4677if ( roundingMode == float_round_nearest_even ) {4678if ( lastBitMask ) {4679add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );4680if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;4681}4682else {4683if ( (sbits64) z.low < 0 ) {4684++z.high;4685if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;4686}4687}4688}4689else if ( roundingMode != float_round_to_zero ) {4690if ( extractFloat128Sign( z )4691^ ( roundingMode == float_round_up ) ) {4692add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );4693}4694}4695z.low &= ~ roundBitsMask;4696}4697else {4698if ( aExp < 0x3FFF ) {4699if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;4700float_exception_flags |= float_flag_inexact;4701aSign = extractFloat128Sign( a );4702switch ( float_rounding_mode ) {4703case float_round_nearest_even:4704if ( ( aExp == 0x3FFE )4705&& ( extractFloat128Frac0( a )4706| extractFloat128Frac1( a ) )4707) {4708return packFloat128( aSign, 0x3FFF, 0, 0 );4709}4710break;4711case float_round_to_zero:4712break;4713case float_round_down:4714return4715aSign ? packFloat128( 1, 0x3FFF, 0, 0 )4716: packFloat128( 0, 0, 0, 0 );4717case float_round_up:4718return4719aSign ? packFloat128( 1, 0, 0, 0 )4720: packFloat128( 0, 0x3FFF, 0, 0 );4721}4722return packFloat128( aSign, 0, 0, 0 );4723}4724lastBitMask = 1;4725lastBitMask <<= 0x402F - aExp;4726roundBitsMask = lastBitMask - 1;4727z.low = 0;4728z.high = a.high;4729roundingMode = float_rounding_mode;4730if ( roundingMode == float_round_nearest_even ) {4731z.high += lastBitMask>>1;4732if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {4733z.high &= ~ lastBitMask;4734}4735}4736else if ( roundingMode != float_round_to_zero ) {4737if ( extractFloat128Sign( z )4738^ ( roundingMode == float_round_up ) ) {4739z.high |= ( a.low != 0 );4740z.high += roundBitsMask;4741}4742}4743z.high &= ~ roundBitsMask;4744}4745if ( ( z.low != a.low ) || ( z.high != a.high ) ) {4746float_exception_flags |= float_flag_inexact;4747}4748return z;47494750}47514752/*4753-------------------------------------------------------------------------------4754Returns the result of adding the absolute values of the quadruple-precision4755floating-point values `a' and `b'. If `zSign' is 1, the sum is negated4756before being returned. `zSign' is ignored if the result is a NaN.4757The addition is performed according to the IEC/IEEE Standard for Binary4758Floating-Point Arithmetic.4759-------------------------------------------------------------------------------4760*/4761static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )4762{4763int32 aExp, bExp, zExp;4764bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;4765int32 expDiff;47664767aSig1 = extractFloat128Frac1( a );4768aSig0 = extractFloat128Frac0( a );4769aExp = extractFloat128Exp( a );4770bSig1 = extractFloat128Frac1( b );4771bSig0 = extractFloat128Frac0( b );4772bExp = extractFloat128Exp( b );4773expDiff = aExp - bExp;4774if ( 0 < expDiff ) {4775if ( aExp == 0x7FFF ) {4776if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );4777return a;4778}4779if ( bExp == 0 ) {4780--expDiff;4781}4782else {4783bSig0 |= LIT64( 0x0001000000000000 );4784}4785shift128ExtraRightJamming(4786bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );4787zExp = aExp;4788}4789else if ( expDiff < 0 ) {4790if ( bExp == 0x7FFF ) {4791if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );4792return packFloat128( zSign, 0x7FFF, 0, 0 );4793}4794if ( aExp == 0 ) {4795++expDiff;4796}4797else {4798aSig0 |= LIT64( 0x0001000000000000 );4799}4800shift128ExtraRightJamming(4801aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );4802zExp = bExp;4803}4804else {4805if ( aExp == 0x7FFF ) {4806if ( aSig0 | aSig1 | bSig0 | bSig1 ) {4807return propagateFloat128NaN( a, b );4808}4809return a;4810}4811add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );4812if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );4813zSig2 = 0;4814zSig0 |= LIT64( 0x0002000000000000 );4815zExp = aExp;4816goto shiftRight1;4817}4818aSig0 |= LIT64( 0x0001000000000000 );4819add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );4820--zExp;4821if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;4822++zExp;4823shiftRight1:4824shift128ExtraRightJamming(4825zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );4826roundAndPack:4827return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );48284829}48304831/*4832-------------------------------------------------------------------------------4833Returns the result of subtracting the absolute values of the quadruple-4834precision floating-point values `a' and `b'. If `zSign' is 1, the4835difference is negated before being returned. `zSign' is ignored if the4836result is a NaN. The subtraction is performed according to the IEC/IEEE4837Standard for Binary Floating-Point Arithmetic.4838-------------------------------------------------------------------------------4839*/4840static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )4841{4842int32 aExp, bExp, zExp;4843bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;4844int32 expDiff;4845float128 z;48464847aSig1 = extractFloat128Frac1( a );4848aSig0 = extractFloat128Frac0( a );4849aExp = extractFloat128Exp( a );4850bSig1 = extractFloat128Frac1( b );4851bSig0 = extractFloat128Frac0( b );4852bExp = extractFloat128Exp( b );4853expDiff = aExp - bExp;4854shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );4855shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );4856if ( 0 < expDiff ) goto aExpBigger;4857if ( expDiff < 0 ) goto bExpBigger;4858if ( aExp == 0x7FFF ) {4859if ( aSig0 | aSig1 | bSig0 | bSig1 ) {4860return propagateFloat128NaN( a, b );4861}4862float_raise( float_flag_invalid );4863z.low = float128_default_nan_low;4864z.high = float128_default_nan_high;4865return z;4866}4867if ( aExp == 0 ) {4868aExp = 1;4869bExp = 1;4870}4871if ( bSig0 < aSig0 ) goto aBigger;4872if ( aSig0 < bSig0 ) goto bBigger;4873if ( bSig1 < aSig1 ) goto aBigger;4874if ( aSig1 < bSig1 ) goto bBigger;4875return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );4876bExpBigger:4877if ( bExp == 0x7FFF ) {4878if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );4879return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );4880}4881if ( aExp == 0 ) {4882++expDiff;4883}4884else {4885aSig0 |= LIT64( 0x4000000000000000 );4886}4887shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );4888bSig0 |= LIT64( 0x4000000000000000 );4889bBigger:4890sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );4891zExp = bExp;4892zSign ^= 1;4893goto normalizeRoundAndPack;4894aExpBigger:4895if ( aExp == 0x7FFF ) {4896if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );4897return a;4898}4899if ( bExp == 0 ) {4900--expDiff;4901}4902else {4903bSig0 |= LIT64( 0x4000000000000000 );4904}4905shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );4906aSig0 |= LIT64( 0x4000000000000000 );4907aBigger:4908sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );4909zExp = aExp;4910normalizeRoundAndPack:4911--zExp;4912return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );49134914}49154916/*4917-------------------------------------------------------------------------------4918Returns the result of adding the quadruple-precision floating-point values4919`a' and `b'. The operation is performed according to the IEC/IEEE Standard4920for Binary Floating-Point Arithmetic.4921-------------------------------------------------------------------------------4922*/4923float128 float128_add( float128 a, float128 b )4924{4925flag aSign, bSign;49264927aSign = extractFloat128Sign( a );4928bSign = extractFloat128Sign( b );4929if ( aSign == bSign ) {4930return addFloat128Sigs( a, b, aSign );4931}4932else {4933return subFloat128Sigs( a, b, aSign );4934}49354936}49374938/*4939-------------------------------------------------------------------------------4940Returns the result of subtracting the quadruple-precision floating-point4941values `a' and `b'. The operation is performed according to the IEC/IEEE4942Standard for Binary Floating-Point Arithmetic.4943-------------------------------------------------------------------------------4944*/4945float128 float128_sub( float128 a, float128 b )4946{4947flag aSign, bSign;49484949aSign = extractFloat128Sign( a );4950bSign = extractFloat128Sign( b );4951if ( aSign == bSign ) {4952return subFloat128Sigs( a, b, aSign );4953}4954else {4955return addFloat128Sigs( a, b, aSign );4956}49574958}49594960/*4961-------------------------------------------------------------------------------4962Returns the result of multiplying the quadruple-precision floating-point4963values `a' and `b'. The operation is performed according to the IEC/IEEE4964Standard for Binary Floating-Point Arithmetic.4965-------------------------------------------------------------------------------4966*/4967float128 float128_mul( float128 a, float128 b )4968{4969flag aSign, bSign, zSign;4970int32 aExp, bExp, zExp;4971bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;4972float128 z;49734974aSig1 = extractFloat128Frac1( a );4975aSig0 = extractFloat128Frac0( a );4976aExp = extractFloat128Exp( a );4977aSign = extractFloat128Sign( a );4978bSig1 = extractFloat128Frac1( b );4979bSig0 = extractFloat128Frac0( b );4980bExp = extractFloat128Exp( b );4981bSign = extractFloat128Sign( b );4982zSign = aSign ^ bSign;4983if ( aExp == 0x7FFF ) {4984if ( ( aSig0 | aSig1 )4985|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {4986return propagateFloat128NaN( a, b );4987}4988if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;4989return packFloat128( zSign, 0x7FFF, 0, 0 );4990}4991if ( bExp == 0x7FFF ) {4992if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );4993if ( ( aExp | aSig0 | aSig1 ) == 0 ) {4994invalid:4995float_raise( float_flag_invalid );4996z.low = float128_default_nan_low;4997z.high = float128_default_nan_high;4998return z;4999}5000return packFloat128( zSign, 0x7FFF, 0, 0 );5001}5002if ( aExp == 0 ) {5003if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );5004normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );5005}5006if ( bExp == 0 ) {5007if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );5008normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );5009}5010zExp = aExp + bExp - 0x4000;5011aSig0 |= LIT64( 0x0001000000000000 );5012shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );5013mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );5014add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );5015zSig2 |= ( zSig3 != 0 );5016if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {5017shift128ExtraRightJamming(5018zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );5019++zExp;5020}5021return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );50225023}50245025/*5026-------------------------------------------------------------------------------5027Returns the result of dividing the quadruple-precision floating-point value5028`a' by the corresponding value `b'. The operation is performed according to5029the IEC/IEEE Standard for Binary Floating-Point Arithmetic.5030-------------------------------------------------------------------------------5031*/5032float128 float128_div( float128 a, float128 b )5033{5034flag aSign, bSign, zSign;5035int32 aExp, bExp, zExp;5036bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;5037bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;5038float128 z;50395040aSig1 = extractFloat128Frac1( a );5041aSig0 = extractFloat128Frac0( a );5042aExp = extractFloat128Exp( a );5043aSign = extractFloat128Sign( a );5044bSig1 = extractFloat128Frac1( b );5045bSig0 = extractFloat128Frac0( b );5046bExp = extractFloat128Exp( b );5047bSign = extractFloat128Sign( b );5048zSign = aSign ^ bSign;5049if ( aExp == 0x7FFF ) {5050if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );5051if ( bExp == 0x7FFF ) {5052if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );5053goto invalid;5054}5055return packFloat128( zSign, 0x7FFF, 0, 0 );5056}5057if ( bExp == 0x7FFF ) {5058if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );5059return packFloat128( zSign, 0, 0, 0 );5060}5061if ( bExp == 0 ) {5062if ( ( bSig0 | bSig1 ) == 0 ) {5063if ( ( aExp | aSig0 | aSig1 ) == 0 ) {5064invalid:5065float_raise( float_flag_invalid );5066z.low = float128_default_nan_low;5067z.high = float128_default_nan_high;5068return z;5069}5070float_raise( float_flag_divbyzero );5071return packFloat128( zSign, 0x7FFF, 0, 0 );5072}5073normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );5074}5075if ( aExp == 0 ) {5076if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );5077normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );5078}5079zExp = aExp - bExp + 0x3FFD;5080shortShift128Left(5081aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );5082shortShift128Left(5083bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );5084if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {5085shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );5086++zExp;5087}5088zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );5089mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );5090sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );5091while ( (sbits64) rem0 < 0 ) {5092--zSig0;5093add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );5094}5095zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );5096if ( ( zSig1 & 0x3FFF ) <= 4 ) {5097mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );5098sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );5099while ( (sbits64) rem1 < 0 ) {5100--zSig1;5101add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );5102}5103zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );5104}5105shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );5106return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );51075108}51095110/*5111-------------------------------------------------------------------------------5112Returns the remainder of the quadruple-precision floating-point value `a'5113with respect to the corresponding value `b'. The operation is performed5114according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.5115-------------------------------------------------------------------------------5116*/5117float128 float128_rem( float128 a, float128 b )5118{5119flag aSign, bSign, zSign;5120int32 aExp, bExp, expDiff;5121bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;5122bits64 allZero, alternateASig0, alternateASig1, sigMean1;5123sbits64 sigMean0;5124float128 z;51255126aSig1 = extractFloat128Frac1( a );5127aSig0 = extractFloat128Frac0( a );5128aExp = extractFloat128Exp( a );5129aSign = extractFloat128Sign( a );5130bSig1 = extractFloat128Frac1( b );5131bSig0 = extractFloat128Frac0( b );5132bExp = extractFloat128Exp( b );5133bSign = extractFloat128Sign( b );5134if ( aExp == 0x7FFF ) {5135if ( ( aSig0 | aSig1 )5136|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {5137return propagateFloat128NaN( a, b );5138}5139goto invalid;5140}5141if ( bExp == 0x7FFF ) {5142if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );5143return a;5144}5145if ( bExp == 0 ) {5146if ( ( bSig0 | bSig1 ) == 0 ) {5147invalid:5148float_raise( float_flag_invalid );5149z.low = float128_default_nan_low;5150z.high = float128_default_nan_high;5151return z;5152}5153normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );5154}5155if ( aExp == 0 ) {5156if ( ( aSig0 | aSig1 ) == 0 ) return a;5157normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );5158}5159expDiff = aExp - bExp;5160if ( expDiff < -1 ) return a;5161shortShift128Left(5162aSig0 | LIT64( 0x0001000000000000 ),5163aSig1,516415 - ( expDiff < 0 ),5165&aSig0,5166&aSig15167);5168shortShift128Left(5169bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );5170q = le128( bSig0, bSig1, aSig0, aSig1 );5171if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );5172expDiff -= 64;5173while ( 0 < expDiff ) {5174q = estimateDiv128To64( aSig0, aSig1, bSig0 );5175q = ( 4 < q ) ? q - 4 : 0;5176mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );5177shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );5178shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );5179sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );5180expDiff -= 61;5181}5182if ( -64 < expDiff ) {5183q = estimateDiv128To64( aSig0, aSig1, bSig0 );5184q = ( 4 < q ) ? q - 4 : 0;5185q >>= - expDiff;5186shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );5187expDiff += 52;5188if ( expDiff < 0 ) {5189shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );5190}5191else {5192shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );5193}5194mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );5195sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );5196}5197else {5198shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );5199shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );5200}5201do {5202alternateASig0 = aSig0;5203alternateASig1 = aSig1;5204++q;5205sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );5206} while ( 0 <= (sbits64) aSig0 );5207add128(5208aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );5209if ( ( sigMean0 < 0 )5210|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {5211aSig0 = alternateASig0;5212aSig1 = alternateASig1;5213}5214zSign = ( (sbits64) aSig0 < 0 );5215if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );5216return5217normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );52185219}52205221/*5222-------------------------------------------------------------------------------5223Returns the square root of the quadruple-precision floating-point value `a'.5224The operation is performed according to the IEC/IEEE Standard for Binary5225Floating-Point Arithmetic.5226-------------------------------------------------------------------------------5227*/5228float128 float128_sqrt( float128 a )5229{5230flag aSign;5231int32 aExp, zExp;5232bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;5233bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;5234float128 z;52355236aSig1 = extractFloat128Frac1( a );5237aSig0 = extractFloat128Frac0( a );5238aExp = extractFloat128Exp( a );5239aSign = extractFloat128Sign( a );5240if ( aExp == 0x7FFF ) {5241if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );5242if ( ! aSign ) return a;5243goto invalid;5244}5245if ( aSign ) {5246if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;5247invalid:5248float_raise( float_flag_invalid );5249z.low = float128_default_nan_low;5250z.high = float128_default_nan_high;5251return z;5252}5253if ( aExp == 0 ) {5254if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );5255normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );5256}5257zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;5258aSig0 |= LIT64( 0x0001000000000000 );5259zSig0 = estimateSqrt32( aExp, aSig0>>17 );5260shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );5261zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );5262doubleZSig0 = zSig0<<1;5263mul64To128( zSig0, zSig0, &term0, &term1 );5264sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );5265while ( (sbits64) rem0 < 0 ) {5266--zSig0;5267doubleZSig0 -= 2;5268add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );5269}5270zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );5271if ( ( zSig1 & 0x1FFF ) <= 5 ) {5272if ( zSig1 == 0 ) zSig1 = 1;5273mul64To128( doubleZSig0, zSig1, &term1, &term2 );5274sub128( rem1, 0, term1, term2, &rem1, &rem2 );5275mul64To128( zSig1, zSig1, &term2, &term3 );5276sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );5277while ( (sbits64) rem1 < 0 ) {5278--zSig1;5279shortShift128Left( 0, zSig1, 1, &term2, &term3 );5280term3 |= 1;5281term2 |= doubleZSig0;5282add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );5283}5284zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );5285}5286shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );5287return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );52885289}52905291/*5292-------------------------------------------------------------------------------5293Returns 1 if the quadruple-precision floating-point value `a' is equal to5294the corresponding value `b', and 0 otherwise. The comparison is performed5295according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.5296-------------------------------------------------------------------------------5297*/5298flag float128_eq( float128 a, float128 b )5299{53005301if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5302&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5303|| ( ( extractFloat128Exp( b ) == 0x7FFF )5304&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5305) {5306if ( float128_is_signaling_nan( a )5307|| float128_is_signaling_nan( b ) ) {5308float_raise( float_flag_invalid );5309}5310return 0;5311}5312return5313( a.low == b.low )5314&& ( ( a.high == b.high )5315|| ( ( a.low == 0 )5316&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )5317);53185319}53205321/*5322-------------------------------------------------------------------------------5323Returns 1 if the quadruple-precision floating-point value `a' is less than5324or equal to the corresponding value `b', and 0 otherwise. The comparison5325is performed according to the IEC/IEEE Standard for Binary Floating-Point5326Arithmetic.5327-------------------------------------------------------------------------------5328*/5329flag float128_le( float128 a, float128 b )5330{5331flag aSign, bSign;53325333if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5334&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5335|| ( ( extractFloat128Exp( b ) == 0x7FFF )5336&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5337) {5338float_raise( float_flag_invalid );5339return 0;5340}5341aSign = extractFloat128Sign( a );5342bSign = extractFloat128Sign( b );5343if ( aSign != bSign ) {5344return5345aSign5346|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )5347== 0 );5348}5349return5350aSign ? le128( b.high, b.low, a.high, a.low )5351: le128( a.high, a.low, b.high, b.low );53525353}53545355/*5356-------------------------------------------------------------------------------5357Returns 1 if the quadruple-precision floating-point value `a' is less than5358the corresponding value `b', and 0 otherwise. The comparison is performed5359according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.5360-------------------------------------------------------------------------------5361*/5362flag float128_lt( float128 a, float128 b )5363{5364flag aSign, bSign;53655366if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5367&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5368|| ( ( extractFloat128Exp( b ) == 0x7FFF )5369&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5370) {5371float_raise( float_flag_invalid );5372return 0;5373}5374aSign = extractFloat128Sign( a );5375bSign = extractFloat128Sign( b );5376if ( aSign != bSign ) {5377return5378aSign5379&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )5380!= 0 );5381}5382return5383aSign ? lt128( b.high, b.low, a.high, a.low )5384: lt128( a.high, a.low, b.high, b.low );53855386}53875388/*5389-------------------------------------------------------------------------------5390Returns 1 if the quadruple-precision floating-point value `a' is equal to5391the corresponding value `b', and 0 otherwise. The invalid exception is5392raised if either operand is a NaN. Otherwise, the comparison is performed5393according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.5394-------------------------------------------------------------------------------5395*/5396flag float128_eq_signaling( float128 a, float128 b )5397{53985399if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5400&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5401|| ( ( extractFloat128Exp( b ) == 0x7FFF )5402&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5403) {5404float_raise( float_flag_invalid );5405return 0;5406}5407return5408( a.low == b.low )5409&& ( ( a.high == b.high )5410|| ( ( a.low == 0 )5411&& ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )5412);54135414}54155416/*5417-------------------------------------------------------------------------------5418Returns 1 if the quadruple-precision floating-point value `a' is less than5419or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not5420cause an exception. Otherwise, the comparison is performed according to the5421IEC/IEEE Standard for Binary Floating-Point Arithmetic.5422-------------------------------------------------------------------------------5423*/5424flag float128_le_quiet( float128 a, float128 b )5425{5426flag aSign, bSign;54275428if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5429&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5430|| ( ( extractFloat128Exp( b ) == 0x7FFF )5431&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5432) {5433if ( float128_is_signaling_nan( a )5434|| float128_is_signaling_nan( b ) ) {5435float_raise( float_flag_invalid );5436}5437return 0;5438}5439aSign = extractFloat128Sign( a );5440bSign = extractFloat128Sign( b );5441if ( aSign != bSign ) {5442return5443aSign5444|| ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )5445== 0 );5446}5447return5448aSign ? le128( b.high, b.low, a.high, a.low )5449: le128( a.high, a.low, b.high, b.low );54505451}54525453/*5454-------------------------------------------------------------------------------5455Returns 1 if the quadruple-precision floating-point value `a' is less than5456the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an5457exception. Otherwise, the comparison is performed according to the IEC/IEEE5458Standard for Binary Floating-Point Arithmetic.5459-------------------------------------------------------------------------------5460*/5461flag float128_lt_quiet( float128 a, float128 b )5462{5463flag aSign, bSign;54645465if ( ( ( extractFloat128Exp( a ) == 0x7FFF )5466&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )5467|| ( ( extractFloat128Exp( b ) == 0x7FFF )5468&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )5469) {5470if ( float128_is_signaling_nan( a )5471|| float128_is_signaling_nan( b ) ) {5472float_raise( float_flag_invalid );5473}5474return 0;5475}5476aSign = extractFloat128Sign( a );5477bSign = extractFloat128Sign( b );5478if ( aSign != bSign ) {5479return5480aSign5481&& ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )5482!= 0 );5483}5484return5485aSign ? lt128( b.high, b.low, a.high, a.low )5486: lt128( a.high, a.low, b.high, b.low );54875488}54895490#endif549154925493#if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)54945495/*5496* These two routines are not part of the original softfloat distribution.5497*5498* They are based on the corresponding conversions to integer but return5499* unsigned numbers instead since these functions are required by GCC.5500*5501* Added by Mark Brinicombe <[email protected]> 27/09/975502*5503* float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]5504*/55055506/*5507-------------------------------------------------------------------------------5508Returns the result of converting the double-precision floating-point value5509`a' to the 32-bit unsigned integer format. The conversion is5510performed according to the IEC/IEEE Standard for Binary Floating-point5511Arithmetic, except that the conversion is always rounded toward zero. If5512`a' is a NaN, the largest positive integer is returned. If the conversion5513overflows, the largest integer positive is returned.5514-------------------------------------------------------------------------------5515*/5516uint32 float64_to_uint32_round_to_zero( float64 a )5517{5518flag aSign;5519int16 aExp, shiftCount;5520bits64 aSig, savedASig;5521uint32 z;55225523aSig = extractFloat64Frac( a );5524aExp = extractFloat64Exp( a );5525aSign = extractFloat64Sign( a );55265527if (aSign) {5528float_raise( float_flag_invalid );5529return(0);5530}55315532if ( 0x41E < aExp ) {5533float_raise( float_flag_invalid );5534return 0xffffffff;5535}5536else if ( aExp < 0x3FF ) {5537if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;5538return 0;5539}5540aSig |= LIT64( 0x0010000000000000 );5541shiftCount = 0x433 - aExp;5542savedASig = aSig;5543aSig >>= shiftCount;5544z = aSig;5545if ( ( aSig<<shiftCount ) != savedASig ) {5546float_exception_flags |= float_flag_inexact;5547}5548return z;55495550}55515552/*5553-------------------------------------------------------------------------------5554Returns the result of converting the single-precision floating-point value5555`a' to the 32-bit unsigned integer format. The conversion is5556performed according to the IEC/IEEE Standard for Binary Floating-point5557Arithmetic, except that the conversion is always rounded toward zero. If5558`a' is a NaN, the largest positive integer is returned. If the conversion5559overflows, the largest positive integer is returned.5560-------------------------------------------------------------------------------5561*/5562uint32 float32_to_uint32_round_to_zero( float32 a )5563{5564flag aSign;5565int16 aExp, shiftCount;5566bits32 aSig;5567uint32 z;55685569aSig = extractFloat32Frac( a );5570aExp = extractFloat32Exp( a );5571aSign = extractFloat32Sign( a );5572shiftCount = aExp - 0x9E;55735574if (aSign) {5575float_raise( float_flag_invalid );5576return(0);5577}5578if ( 0 < shiftCount ) {5579float_raise( float_flag_invalid );5580return 0xFFFFFFFF;5581}5582else if ( aExp <= 0x7E ) {5583if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;5584return 0;5585}5586aSig = ( aSig | 0x800000 )<<8;5587z = aSig>>( - shiftCount );5588if ( aSig<<( shiftCount & 31 ) ) {5589float_exception_flags |= float_flag_inexact;5590}5591return z;55925593}55945595#endif559655975598