Path: blob/main/lib/libc/softfloat/bits32/softfloat.c
39530 views
/* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */12/*3* This version hacked for use with gcc -msoft-float by bjh21.4* (Mostly a case of #ifdefing out things GCC doesn't need or provides5* itself).6*/78/*9* Things you may want to define:10*11* SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with12* -msoft-float) to work. Include "softfloat-for-gcc.h" to get them13* properly renamed.14*/1516/*17* This differs from the standard bits32/softfloat.c in that float6418* is defined to be a 64-bit integer rather than a structure. The19* structure is float64s, with translation between the two going via20* float64u.21*/2223/*24===============================================================================2526This C source file is part of the SoftFloat IEC/IEEE Floating-Point27Arithmetic Package, Release 2a.2829Written by John R. Hauser. This work was made possible in part by the30International Computer Science Institute, located at Suite 600, 1947 Center31Street, Berkeley, California 94704. Funding was partially provided by the32National Science Foundation under grant MIP-9311980. The original version33of this code was written as part of a project to build a fixed-point vector34processor in collaboration with the University of California at Berkeley,35overseen by Profs. Nelson Morgan and John Wawrzynek. More information36is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/37arithmetic/SoftFloat.html'.3839THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort40has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT41TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO42PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY43AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.4445Derivative works are acceptable, even for commercial purposes, so long as46(1) they include prominent notice that the work is derivative, and (2) they47include prominent notice akin to these four paragraphs for those parts of48this code that are retained.4950===============================================================================51*/5253#ifdef SOFTFLOAT_FOR_GCC54#include "softfloat-for-gcc.h"55#endif5657#include "milieu.h"58#include "softfloat.h"5960/*61* Conversions between floats as stored in memory and floats as62* SoftFloat uses them63*/64#ifndef FLOAT64_DEMANGLE65#define FLOAT64_DEMANGLE(a) (a)66#endif67#ifndef FLOAT64_MANGLE68#define FLOAT64_MANGLE(a) (a)69#endif7071/*72-------------------------------------------------------------------------------73Floating-point rounding mode and exception flags.74-------------------------------------------------------------------------------75*/76int float_rounding_mode = float_round_nearest_even;77int float_exception_flags = 0;7879/*80-------------------------------------------------------------------------------81Primitive arithmetic functions, including multi-word arithmetic, and82division and square root approximations. (Can be specialized to target if83desired.)84-------------------------------------------------------------------------------85*/86#include "softfloat-macros"8788/*89-------------------------------------------------------------------------------90Functions and definitions to determine: (1) whether tininess for underflow91is detected before or after rounding by default, (2) what (if anything)92happens when exceptions are raised, (3) how signaling NaNs are distinguished93from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs94are propagated from function inputs to output. These details are target-95specific.96-------------------------------------------------------------------------------97*/98#include "softfloat-specialize"99100/*101-------------------------------------------------------------------------------102Returns the fraction bits of the single-precision floating-point value `a'.103-------------------------------------------------------------------------------104*/105INLINE bits32 extractFloat32Frac( float32 a )106{107108return a & 0x007FFFFF;109110}111112/*113-------------------------------------------------------------------------------114Returns the exponent bits of the single-precision floating-point value `a'.115-------------------------------------------------------------------------------116*/117INLINE int16 extractFloat32Exp( float32 a )118{119120return ( a>>23 ) & 0xFF;121122}123124/*125-------------------------------------------------------------------------------126Returns the sign bit of the single-precision floating-point value `a'.127-------------------------------------------------------------------------------128*/129INLINE flag extractFloat32Sign( float32 a )130{131132return a>>31;133134}135136/*137-------------------------------------------------------------------------------138Normalizes the subnormal single-precision floating-point value represented139by the denormalized significand `aSig'. The normalized exponent and140significand are stored at the locations pointed to by `zExpPtr' and141`zSigPtr', respectively.142-------------------------------------------------------------------------------143*/144static void145normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )146{147int8 shiftCount;148149shiftCount = countLeadingZeros32( aSig ) - 8;150*zSigPtr = aSig<<shiftCount;151*zExpPtr = 1 - shiftCount;152153}154155/*156-------------------------------------------------------------------------------157Packs the sign `zSign', exponent `zExp', and significand `zSig' into a158single-precision floating-point value, returning the result. After being159shifted into the proper positions, the three fields are simply added160together to form the result. This means that any integer portion of `zSig'161will be added into the exponent. Since a properly normalized significand162will have an integer portion equal to 1, the `zExp' input should be 1 less163than the desired result exponent whenever `zSig' is a complete, normalized164significand.165-------------------------------------------------------------------------------166*/167INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )168{169170return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;171172}173174/*175-------------------------------------------------------------------------------176Takes an abstract floating-point value having sign `zSign', exponent `zExp',177and significand `zSig', and returns the proper single-precision floating-178point value corresponding to the abstract input. Ordinarily, the abstract179value is simply rounded and packed into the single-precision format, with180the inexact exception raised if the abstract input cannot be represented181exactly. However, if the abstract value is too large, the overflow and182inexact exceptions are raised and an infinity or maximal finite value is183returned. If the abstract value is too small, the input value is rounded to184a subnormal number, and the underflow and inexact exceptions are raised if185the abstract input cannot be represented exactly as a subnormal single-186precision floating-point number.187The input significand `zSig' has its binary point between bits 30188and 29, which is 7 bits to the left of the usual location. This shifted189significand must be normalized or smaller. If `zSig' is not normalized,190`zExp' must be 0; in that case, the result returned is a subnormal number,191and it must not require rounding. In the usual case that `zSig' is192normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.193The handling of underflow and overflow follows the IEC/IEEE Standard for194Binary Floating-Point Arithmetic.195-------------------------------------------------------------------------------196*/197static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )198{199int8 roundingMode;200flag roundNearestEven;201int8 roundIncrement, roundBits;202flag isTiny;203204roundingMode = float_rounding_mode;205roundNearestEven = roundingMode == float_round_nearest_even;206roundIncrement = 0x40;207if ( ! roundNearestEven ) {208if ( roundingMode == float_round_to_zero ) {209roundIncrement = 0;210}211else {212roundIncrement = 0x7F;213if ( zSign ) {214if ( roundingMode == float_round_up ) roundIncrement = 0;215}216else {217if ( roundingMode == float_round_down ) roundIncrement = 0;218}219}220}221roundBits = zSig & 0x7F;222if ( 0xFD <= (bits16) zExp ) {223if ( ( 0xFD < zExp )224|| ( ( zExp == 0xFD )225&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )226) {227float_raise( float_flag_overflow | float_flag_inexact );228return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );229}230if ( zExp < 0 ) {231isTiny =232( float_detect_tininess == float_tininess_before_rounding )233|| ( zExp < -1 )234|| ( zSig + roundIncrement < 0x80000000 );235shift32RightJamming( zSig, - zExp, &zSig );236zExp = 0;237roundBits = zSig & 0x7F;238if ( isTiny && roundBits ) float_raise( float_flag_underflow );239}240}241if ( roundBits ) float_exception_flags |= float_flag_inexact;242zSig = ( zSig + roundIncrement )>>7;243zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );244if ( zSig == 0 ) zExp = 0;245return packFloat32( zSign, zExp, zSig );246247}248249/*250-------------------------------------------------------------------------------251Takes an abstract floating-point value having sign `zSign', exponent `zExp',252and significand `zSig', and returns the proper single-precision floating-253point value corresponding to the abstract input. This routine is just like254`roundAndPackFloat32' except that `zSig' does not have to be normalized.255Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''256floating-point exponent.257-------------------------------------------------------------------------------258*/259static float32260normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )261{262int8 shiftCount;263264shiftCount = countLeadingZeros32( zSig ) - 1;265return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );266267}268269/*270-------------------------------------------------------------------------------271Returns the least-significant 32 fraction bits of the double-precision272floating-point value `a'.273-------------------------------------------------------------------------------274*/275INLINE bits32 extractFloat64Frac1( float64 a )276{277278return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );279280}281282/*283-------------------------------------------------------------------------------284Returns the most-significant 20 fraction bits of the double-precision285floating-point value `a'.286-------------------------------------------------------------------------------287*/288INLINE bits32 extractFloat64Frac0( float64 a )289{290291return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;292293}294295/*296-------------------------------------------------------------------------------297Returns the exponent bits of the double-precision floating-point value `a'.298-------------------------------------------------------------------------------299*/300INLINE int16 extractFloat64Exp( float64 a )301{302303return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;304305}306307/*308-------------------------------------------------------------------------------309Returns the sign bit of the double-precision floating-point value `a'.310-------------------------------------------------------------------------------311*/312INLINE flag extractFloat64Sign( float64 a )313{314315return FLOAT64_DEMANGLE(a)>>63;316317}318319/*320-------------------------------------------------------------------------------321Normalizes the subnormal double-precision floating-point value represented322by the denormalized significand formed by the concatenation of `aSig0' and323`aSig1'. The normalized exponent is stored at the location pointed to by324`zExpPtr'. The most significant 21 bits of the normalized significand are325stored at the location pointed to by `zSig0Ptr', and the least significant32632 bits of the normalized significand are stored at the location pointed to327by `zSig1Ptr'.328-------------------------------------------------------------------------------329*/330static void331normalizeFloat64Subnormal(332bits32 aSig0,333bits32 aSig1,334int16 *zExpPtr,335bits32 *zSig0Ptr,336bits32 *zSig1Ptr337)338{339int8 shiftCount;340341if ( aSig0 == 0 ) {342shiftCount = countLeadingZeros32( aSig1 ) - 11;343if ( shiftCount < 0 ) {344*zSig0Ptr = aSig1>>( - shiftCount );345*zSig1Ptr = aSig1<<( shiftCount & 31 );346}347else {348*zSig0Ptr = aSig1<<shiftCount;349*zSig1Ptr = 0;350}351*zExpPtr = - shiftCount - 31;352}353else {354shiftCount = countLeadingZeros32( aSig0 ) - 11;355shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );356*zExpPtr = 1 - shiftCount;357}358359}360361/*362-------------------------------------------------------------------------------363Packs the sign `zSign', the exponent `zExp', and the significand formed by364the concatenation of `zSig0' and `zSig1' into a double-precision floating-365point value, returning the result. After being shifted into the proper366positions, the three fields `zSign', `zExp', and `zSig0' are simply added367together to form the most significant 32 bits of the result. This means368that any integer portion of `zSig0' will be added into the exponent. Since369a properly normalized significand will have an integer portion equal to 1,370the `zExp' input should be 1 less than the desired result exponent whenever371`zSig0' and `zSig1' concatenated form a complete, normalized significand.372-------------------------------------------------------------------------------373*/374INLINE float64375packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )376{377378return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +379( ( (bits64) zExp )<<52 ) +380( ( (bits64) zSig0 )<<32 ) + zSig1 );381382383}384385/*386-------------------------------------------------------------------------------387Takes an abstract floating-point value having sign `zSign', exponent `zExp',388and extended significand formed by the concatenation of `zSig0', `zSig1',389and `zSig2', and returns the proper double-precision floating-point value390corresponding to the abstract input. Ordinarily, the abstract value is391simply rounded and packed into the double-precision format, with the inexact392exception raised if the abstract input cannot be represented exactly.393However, if the abstract value is too large, the overflow and inexact394exceptions are raised and an infinity or maximal finite value is returned.395If the abstract value is too small, the input value is rounded to a396subnormal number, and the underflow and inexact exceptions are raised if the397abstract input cannot be represented exactly as a subnormal double-precision398floating-point number.399The input significand must be normalized or smaller. If the input400significand is not normalized, `zExp' must be 0; in that case, the result401returned is a subnormal number, and it must not require rounding. In the402usual case that the input significand is normalized, `zExp' must be 1 less403than the ``true'' floating-point exponent. The handling of underflow and404overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.405-------------------------------------------------------------------------------406*/407static float64408roundAndPackFloat64(409flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )410{411int8 roundingMode;412flag roundNearestEven, increment, isTiny;413414roundingMode = float_rounding_mode;415roundNearestEven = ( roundingMode == float_round_nearest_even );416increment = ( (sbits32) zSig2 < 0 );417if ( ! roundNearestEven ) {418if ( roundingMode == float_round_to_zero ) {419increment = 0;420}421else {422if ( zSign ) {423increment = ( roundingMode == float_round_down ) && zSig2;424}425else {426increment = ( roundingMode == float_round_up ) && zSig2;427}428}429}430if ( 0x7FD <= (bits16) zExp ) {431if ( ( 0x7FD < zExp )432|| ( ( zExp == 0x7FD )433&& eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )434&& increment435)436) {437float_raise( float_flag_overflow | float_flag_inexact );438if ( ( roundingMode == float_round_to_zero )439|| ( zSign && ( roundingMode == float_round_up ) )440|| ( ! zSign && ( roundingMode == float_round_down ) )441) {442return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );443}444return packFloat64( zSign, 0x7FF, 0, 0 );445}446if ( zExp < 0 ) {447isTiny =448( float_detect_tininess == float_tininess_before_rounding )449|| ( zExp < -1 )450|| ! increment451|| lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );452shift64ExtraRightJamming(453zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );454zExp = 0;455if ( isTiny && zSig2 ) float_raise( float_flag_underflow );456if ( roundNearestEven ) {457increment = ( (sbits32) zSig2 < 0 );458}459else {460if ( zSign ) {461increment = ( roundingMode == float_round_down ) && zSig2;462}463else {464increment = ( roundingMode == float_round_up ) && zSig2;465}466}467}468}469if ( zSig2 ) float_exception_flags |= float_flag_inexact;470if ( increment ) {471add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );472zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );473}474else {475if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;476}477return packFloat64( zSign, zExp, zSig0, zSig1 );478479}480481/*482-------------------------------------------------------------------------------483Takes an abstract floating-point value having sign `zSign', exponent `zExp',484and significand formed by the concatenation of `zSig0' and `zSig1', and485returns the proper double-precision floating-point value corresponding486to the abstract input. This routine is just like `roundAndPackFloat64'487except that the input significand has fewer bits and does not have to be488normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-489point exponent.490-------------------------------------------------------------------------------491*/492static float64493normalizeRoundAndPackFloat64(494flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )495{496int8 shiftCount;497bits32 zSig2;498499if ( zSig0 == 0 ) {500zSig0 = zSig1;501zSig1 = 0;502zExp -= 32;503}504shiftCount = countLeadingZeros32( zSig0 ) - 11;505if ( 0 <= shiftCount ) {506zSig2 = 0;507shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );508}509else {510shift64ExtraRightJamming(511zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );512}513zExp -= shiftCount;514return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );515516}517518/*519-------------------------------------------------------------------------------520Returns the result of converting the 32-bit two's complement integer `a' to521the single-precision floating-point format. The conversion is performed522according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.523-------------------------------------------------------------------------------524*/525float32 int32_to_float32( int32 a )526{527flag zSign;528529if ( a == 0 ) return 0;530if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );531zSign = ( a < 0 );532return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );533534}535536/*537-------------------------------------------------------------------------------538Returns the result of converting the 32-bit two's complement integer `a' to539the double-precision floating-point format. The conversion is performed540according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.541-------------------------------------------------------------------------------542*/543float64 int32_to_float64( int32 a )544{545flag zSign;546bits32 absA;547int8 shiftCount;548bits32 zSig0, zSig1;549550if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );551zSign = ( a < 0 );552absA = zSign ? - a : a;553shiftCount = countLeadingZeros32( absA ) - 11;554if ( 0 <= shiftCount ) {555zSig0 = absA<<shiftCount;556zSig1 = 0;557}558else {559shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );560}561return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );562563}564565#ifndef SOFTFLOAT_FOR_GCC566/*567-------------------------------------------------------------------------------568Returns the result of converting the single-precision floating-point value569`a' to the 32-bit two's complement integer format. The conversion is570performed according to the IEC/IEEE Standard for Binary Floating-Point571Arithmetic---which means in particular that the conversion is rounded572according to the current rounding mode. If `a' is a NaN, the largest573positive integer is returned. Otherwise, if the conversion overflows, the574largest integer with the same sign as `a' is returned.575-------------------------------------------------------------------------------576*/577int32 float32_to_int32( float32 a )578{579flag aSign;580int16 aExp, shiftCount;581bits32 aSig, aSigExtra;582int32 z;583int8 roundingMode;584585aSig = extractFloat32Frac( a );586aExp = extractFloat32Exp( a );587aSign = extractFloat32Sign( a );588shiftCount = aExp - 0x96;589if ( 0 <= shiftCount ) {590if ( 0x9E <= aExp ) {591if ( a != 0xCF000000 ) {592float_raise( float_flag_invalid );593if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {594return 0x7FFFFFFF;595}596}597return (sbits32) 0x80000000;598}599z = ( aSig | 0x00800000 )<<shiftCount;600if ( aSign ) z = - z;601}602else {603if ( aExp < 0x7E ) {604aSigExtra = aExp | aSig;605z = 0;606}607else {608aSig |= 0x00800000;609aSigExtra = aSig<<( shiftCount & 31 );610z = aSig>>( - shiftCount );611}612if ( aSigExtra ) float_exception_flags |= float_flag_inexact;613roundingMode = float_rounding_mode;614if ( roundingMode == float_round_nearest_even ) {615if ( (sbits32) aSigExtra < 0 ) {616++z;617if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;618}619if ( aSign ) z = - z;620}621else {622aSigExtra = ( aSigExtra != 0 );623if ( aSign ) {624z += ( roundingMode == float_round_down ) & aSigExtra;625z = - z;626}627else {628z += ( roundingMode == float_round_up ) & aSigExtra;629}630}631}632return z;633634}635#endif636637/*638-------------------------------------------------------------------------------639Returns the result of converting the single-precision floating-point value640`a' to the 32-bit two's complement integer format. The conversion is641performed according to the IEC/IEEE Standard for Binary Floating-Point642Arithmetic, except that the conversion is always rounded toward zero.643If `a' is a NaN, the largest positive integer is returned. Otherwise, if644the conversion overflows, the largest integer with the same sign as `a' is645returned.646-------------------------------------------------------------------------------647*/648int32 float32_to_int32_round_to_zero( float32 a )649{650flag aSign;651int16 aExp, shiftCount;652bits32 aSig;653int32 z;654655aSig = extractFloat32Frac( a );656aExp = extractFloat32Exp( a );657aSign = extractFloat32Sign( a );658shiftCount = aExp - 0x9E;659if ( 0 <= shiftCount ) {660if ( a != 0xCF000000 ) {661float_raise( float_flag_invalid );662if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;663}664return (sbits32) 0x80000000;665}666else if ( aExp <= 0x7E ) {667if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;668return 0;669}670aSig = ( aSig | 0x00800000 )<<8;671z = aSig>>( - shiftCount );672if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {673float_exception_flags |= float_flag_inexact;674}675if ( aSign ) z = - z;676return z;677678}679680/*681-------------------------------------------------------------------------------682Returns the result of converting the single-precision floating-point value683`a' to the double-precision floating-point format. The conversion is684performed according to the IEC/IEEE Standard for Binary Floating-Point685Arithmetic.686-------------------------------------------------------------------------------687*/688float64 float32_to_float64( float32 a )689{690flag aSign;691int16 aExp;692bits32 aSig, zSig0, zSig1;693694aSig = extractFloat32Frac( a );695aExp = extractFloat32Exp( a );696aSign = extractFloat32Sign( a );697if ( aExp == 0xFF ) {698if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );699return packFloat64( aSign, 0x7FF, 0, 0 );700}701if ( aExp == 0 ) {702if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );703normalizeFloat32Subnormal( aSig, &aExp, &aSig );704--aExp;705}706shift64Right( aSig, 0, 3, &zSig0, &zSig1 );707return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );708709}710711#ifndef SOFTFLOAT_FOR_GCC712/*713-------------------------------------------------------------------------------714Rounds the single-precision floating-point value `a' to an integer,715and returns the result as a single-precision floating-point value. The716operation is performed according to the IEC/IEEE Standard for Binary717Floating-Point Arithmetic.718-------------------------------------------------------------------------------719*/720float32 float32_round_to_int( float32 a )721{722flag aSign;723int16 aExp;724bits32 lastBitMask, roundBitsMask;725int8 roundingMode;726float32 z;727728aExp = extractFloat32Exp( a );729if ( 0x96 <= aExp ) {730if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {731return propagateFloat32NaN( a, a );732}733return a;734}735if ( aExp <= 0x7E ) {736if ( (bits32) ( a<<1 ) == 0 ) return a;737float_exception_flags |= float_flag_inexact;738aSign = extractFloat32Sign( a );739switch ( float_rounding_mode ) {740case float_round_nearest_even:741if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {742return packFloat32( aSign, 0x7F, 0 );743}744break;745case float_round_to_zero:746break;747case float_round_down:748return aSign ? 0xBF800000 : 0;749case float_round_up:750return aSign ? 0x80000000 : 0x3F800000;751}752return packFloat32( aSign, 0, 0 );753}754lastBitMask = 1;755lastBitMask <<= 0x96 - aExp;756roundBitsMask = lastBitMask - 1;757z = a;758roundingMode = float_rounding_mode;759if ( roundingMode == float_round_nearest_even ) {760z += lastBitMask>>1;761if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;762}763else if ( roundingMode != float_round_to_zero ) {764if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {765z += roundBitsMask;766}767}768z &= ~ roundBitsMask;769if ( z != a ) float_exception_flags |= float_flag_inexact;770return z;771772}773#endif774775/*776-------------------------------------------------------------------------------777Returns the result of adding the absolute values of the single-precision778floating-point values `a' and `b'. If `zSign' is 1, the sum is negated779before being returned. `zSign' is ignored if the result is a NaN.780The addition is performed according to the IEC/IEEE Standard for Binary781Floating-Point Arithmetic.782-------------------------------------------------------------------------------783*/784static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )785{786int16 aExp, bExp, zExp;787bits32 aSig, bSig, zSig;788int16 expDiff;789790aSig = extractFloat32Frac( a );791aExp = extractFloat32Exp( a );792bSig = extractFloat32Frac( b );793bExp = extractFloat32Exp( b );794expDiff = aExp - bExp;795aSig <<= 6;796bSig <<= 6;797if ( 0 < expDiff ) {798if ( aExp == 0xFF ) {799if ( aSig ) return propagateFloat32NaN( a, b );800return a;801}802if ( bExp == 0 ) {803--expDiff;804}805else {806bSig |= 0x20000000;807}808shift32RightJamming( bSig, expDiff, &bSig );809zExp = aExp;810}811else if ( expDiff < 0 ) {812if ( bExp == 0xFF ) {813if ( bSig ) return propagateFloat32NaN( a, b );814return packFloat32( zSign, 0xFF, 0 );815}816if ( aExp == 0 ) {817++expDiff;818}819else {820aSig |= 0x20000000;821}822shift32RightJamming( aSig, - expDiff, &aSig );823zExp = bExp;824}825else {826if ( aExp == 0xFF ) {827if ( aSig | bSig ) return propagateFloat32NaN( a, b );828return a;829}830if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );831zSig = 0x40000000 + aSig + bSig;832zExp = aExp;833goto roundAndPack;834}835aSig |= 0x20000000;836zSig = ( aSig + bSig )<<1;837--zExp;838if ( (sbits32) zSig < 0 ) {839zSig = aSig + bSig;840++zExp;841}842roundAndPack:843return roundAndPackFloat32( zSign, zExp, zSig );844845}846847/*848-------------------------------------------------------------------------------849Returns the result of subtracting the absolute values of the single-850precision floating-point values `a' and `b'. If `zSign' is 1, the851difference is negated before being returned. `zSign' is ignored if the852result is a NaN. The subtraction is performed according to the IEC/IEEE853Standard for Binary Floating-Point Arithmetic.854-------------------------------------------------------------------------------855*/856static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )857{858int16 aExp, bExp, zExp;859bits32 aSig, bSig, zSig;860int16 expDiff;861862aSig = extractFloat32Frac( a );863aExp = extractFloat32Exp( a );864bSig = extractFloat32Frac( b );865bExp = extractFloat32Exp( b );866expDiff = aExp - bExp;867aSig <<= 7;868bSig <<= 7;869if ( 0 < expDiff ) goto aExpBigger;870if ( expDiff < 0 ) goto bExpBigger;871if ( aExp == 0xFF ) {872if ( aSig | bSig ) return propagateFloat32NaN( a, b );873float_raise( float_flag_invalid );874return float32_default_nan;875}876if ( aExp == 0 ) {877aExp = 1;878bExp = 1;879}880if ( bSig < aSig ) goto aBigger;881if ( aSig < bSig ) goto bBigger;882return packFloat32( float_rounding_mode == float_round_down, 0, 0 );883bExpBigger:884if ( bExp == 0xFF ) {885if ( bSig ) return propagateFloat32NaN( a, b );886return packFloat32( zSign ^ 1, 0xFF, 0 );887}888if ( aExp == 0 ) {889++expDiff;890}891else {892aSig |= 0x40000000;893}894shift32RightJamming( aSig, - expDiff, &aSig );895bSig |= 0x40000000;896bBigger:897zSig = bSig - aSig;898zExp = bExp;899zSign ^= 1;900goto normalizeRoundAndPack;901aExpBigger:902if ( aExp == 0xFF ) {903if ( aSig ) return propagateFloat32NaN( a, b );904return a;905}906if ( bExp == 0 ) {907--expDiff;908}909else {910bSig |= 0x40000000;911}912shift32RightJamming( bSig, expDiff, &bSig );913aSig |= 0x40000000;914aBigger:915zSig = aSig - bSig;916zExp = aExp;917normalizeRoundAndPack:918--zExp;919return normalizeRoundAndPackFloat32( zSign, zExp, zSig );920921}922923/*924-------------------------------------------------------------------------------925Returns the result of adding the single-precision floating-point values `a'926and `b'. The operation is performed according to the IEC/IEEE Standard for927Binary Floating-Point Arithmetic.928-------------------------------------------------------------------------------929*/930float32 float32_add( float32 a, float32 b )931{932flag aSign, bSign;933934aSign = extractFloat32Sign( a );935bSign = extractFloat32Sign( b );936if ( aSign == bSign ) {937return addFloat32Sigs( a, b, aSign );938}939else {940return subFloat32Sigs( a, b, aSign );941}942943}944945/*946-------------------------------------------------------------------------------947Returns the result of subtracting the single-precision floating-point values948`a' and `b'. The operation is performed according to the IEC/IEEE Standard949for Binary Floating-Point Arithmetic.950-------------------------------------------------------------------------------951*/952float32 float32_sub( float32 a, float32 b )953{954flag aSign, bSign;955956aSign = extractFloat32Sign( a );957bSign = extractFloat32Sign( b );958if ( aSign == bSign ) {959return subFloat32Sigs( a, b, aSign );960}961else {962return addFloat32Sigs( a, b, aSign );963}964965}966967/*968-------------------------------------------------------------------------------969Returns the result of multiplying the single-precision floating-point values970`a' and `b'. The operation is performed according to the IEC/IEEE Standard971for Binary Floating-Point Arithmetic.972-------------------------------------------------------------------------------973*/974float32 float32_mul( float32 a, float32 b )975{976flag aSign, bSign, zSign;977int16 aExp, bExp, zExp;978bits32 aSig, bSig, zSig0, zSig1;979980aSig = extractFloat32Frac( a );981aExp = extractFloat32Exp( a );982aSign = extractFloat32Sign( a );983bSig = extractFloat32Frac( b );984bExp = extractFloat32Exp( b );985bSign = extractFloat32Sign( b );986zSign = aSign ^ bSign;987if ( aExp == 0xFF ) {988if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {989return propagateFloat32NaN( a, b );990}991if ( ( bExp | bSig ) == 0 ) {992float_raise( float_flag_invalid );993return float32_default_nan;994}995return packFloat32( zSign, 0xFF, 0 );996}997if ( bExp == 0xFF ) {998if ( bSig ) return propagateFloat32NaN( a, b );999if ( ( aExp | aSig ) == 0 ) {1000float_raise( float_flag_invalid );1001return float32_default_nan;1002}1003return packFloat32( zSign, 0xFF, 0 );1004}1005if ( aExp == 0 ) {1006if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );1007normalizeFloat32Subnormal( aSig, &aExp, &aSig );1008}1009if ( bExp == 0 ) {1010if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );1011normalizeFloat32Subnormal( bSig, &bExp, &bSig );1012}1013zExp = aExp + bExp - 0x7F;1014aSig = ( aSig | 0x00800000 )<<7;1015bSig = ( bSig | 0x00800000 )<<8;1016mul32To64( aSig, bSig, &zSig0, &zSig1 );1017zSig0 |= ( zSig1 != 0 );1018if ( 0 <= (sbits32) ( zSig0<<1 ) ) {1019zSig0 <<= 1;1020--zExp;1021}1022return roundAndPackFloat32( zSign, zExp, zSig0 );10231024}10251026/*1027-------------------------------------------------------------------------------1028Returns the result of dividing the single-precision floating-point value `a'1029by the corresponding value `b'. The operation is performed according to the1030IEC/IEEE Standard for Binary Floating-Point Arithmetic.1031-------------------------------------------------------------------------------1032*/1033float32 float32_div( float32 a, float32 b )1034{1035flag aSign, bSign, zSign;1036int16 aExp, bExp, zExp;1037bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;10381039aSig = extractFloat32Frac( a );1040aExp = extractFloat32Exp( a );1041aSign = extractFloat32Sign( a );1042bSig = extractFloat32Frac( b );1043bExp = extractFloat32Exp( b );1044bSign = extractFloat32Sign( b );1045zSign = aSign ^ bSign;1046if ( aExp == 0xFF ) {1047if ( aSig ) return propagateFloat32NaN( a, b );1048if ( bExp == 0xFF ) {1049if ( bSig ) return propagateFloat32NaN( a, b );1050float_raise( float_flag_invalid );1051return float32_default_nan;1052}1053return packFloat32( zSign, 0xFF, 0 );1054}1055if ( bExp == 0xFF ) {1056if ( bSig ) return propagateFloat32NaN( a, b );1057return packFloat32( zSign, 0, 0 );1058}1059if ( bExp == 0 ) {1060if ( bSig == 0 ) {1061if ( ( aExp | aSig ) == 0 ) {1062float_raise( float_flag_invalid );1063return float32_default_nan;1064}1065float_raise( float_flag_divbyzero );1066return packFloat32( zSign, 0xFF, 0 );1067}1068normalizeFloat32Subnormal( bSig, &bExp, &bSig );1069}1070if ( aExp == 0 ) {1071if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );1072normalizeFloat32Subnormal( aSig, &aExp, &aSig );1073}1074zExp = aExp - bExp + 0x7D;1075aSig = ( aSig | 0x00800000 )<<7;1076bSig = ( bSig | 0x00800000 )<<8;1077if ( bSig <= ( aSig + aSig ) ) {1078aSig >>= 1;1079++zExp;1080}1081zSig = estimateDiv64To32( aSig, 0, bSig );1082if ( ( zSig & 0x3F ) <= 2 ) {1083mul32To64( bSig, zSig, &term0, &term1 );1084sub64( aSig, 0, term0, term1, &rem0, &rem1 );1085while ( (sbits32) rem0 < 0 ) {1086--zSig;1087add64( rem0, rem1, 0, bSig, &rem0, &rem1 );1088}1089zSig |= ( rem1 != 0 );1090}1091return roundAndPackFloat32( zSign, zExp, zSig );10921093}10941095#ifndef SOFTFLOAT_FOR_GCC1096/*1097-------------------------------------------------------------------------------1098Returns the remainder of the single-precision floating-point value `a'1099with respect to the corresponding value `b'. The operation is performed1100according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1101-------------------------------------------------------------------------------1102*/1103float32 float32_rem( float32 a, float32 b )1104{1105flag aSign, bSign, zSign;1106int16 aExp, bExp, expDiff;1107bits32 aSig, bSig, q, allZero, alternateASig;1108sbits32 sigMean;11091110aSig = extractFloat32Frac( a );1111aExp = extractFloat32Exp( a );1112aSign = extractFloat32Sign( a );1113bSig = extractFloat32Frac( b );1114bExp = extractFloat32Exp( b );1115bSign = extractFloat32Sign( b );1116if ( aExp == 0xFF ) {1117if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {1118return propagateFloat32NaN( a, b );1119}1120float_raise( float_flag_invalid );1121return float32_default_nan;1122}1123if ( bExp == 0xFF ) {1124if ( bSig ) return propagateFloat32NaN( a, b );1125return a;1126}1127if ( bExp == 0 ) {1128if ( bSig == 0 ) {1129float_raise( float_flag_invalid );1130return float32_default_nan;1131}1132normalizeFloat32Subnormal( bSig, &bExp, &bSig );1133}1134if ( aExp == 0 ) {1135if ( aSig == 0 ) return a;1136normalizeFloat32Subnormal( aSig, &aExp, &aSig );1137}1138expDiff = aExp - bExp;1139aSig = ( aSig | 0x00800000 )<<8;1140bSig = ( bSig | 0x00800000 )<<8;1141if ( expDiff < 0 ) {1142if ( expDiff < -1 ) return a;1143aSig >>= 1;1144}1145q = ( bSig <= aSig );1146if ( q ) aSig -= bSig;1147expDiff -= 32;1148while ( 0 < expDiff ) {1149q = estimateDiv64To32( aSig, 0, bSig );1150q = ( 2 < q ) ? q - 2 : 0;1151aSig = - ( ( bSig>>2 ) * q );1152expDiff -= 30;1153}1154expDiff += 32;1155if ( 0 < expDiff ) {1156q = estimateDiv64To32( aSig, 0, bSig );1157q = ( 2 < q ) ? q - 2 : 0;1158q >>= 32 - expDiff;1159bSig >>= 2;1160aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;1161}1162else {1163aSig >>= 2;1164bSig >>= 2;1165}1166do {1167alternateASig = aSig;1168++q;1169aSig -= bSig;1170} while ( 0 <= (sbits32) aSig );1171sigMean = aSig + alternateASig;1172if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {1173aSig = alternateASig;1174}1175zSign = ( (sbits32) aSig < 0 );1176if ( zSign ) aSig = - aSig;1177return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );11781179}1180#endif11811182#ifndef SOFTFLOAT_FOR_GCC1183/*1184-------------------------------------------------------------------------------1185Returns the square root of the single-precision floating-point value `a'.1186The operation is performed according to the IEC/IEEE Standard for Binary1187Floating-Point Arithmetic.1188-------------------------------------------------------------------------------1189*/1190float32 float32_sqrt( float32 a )1191{1192flag aSign;1193int16 aExp, zExp;1194bits32 aSig, zSig, rem0, rem1, term0, term1;11951196aSig = extractFloat32Frac( a );1197aExp = extractFloat32Exp( a );1198aSign = extractFloat32Sign( a );1199if ( aExp == 0xFF ) {1200if ( aSig ) return propagateFloat32NaN( a, 0 );1201if ( ! aSign ) return a;1202float_raise( float_flag_invalid );1203return float32_default_nan;1204}1205if ( aSign ) {1206if ( ( aExp | aSig ) == 0 ) return a;1207float_raise( float_flag_invalid );1208return float32_default_nan;1209}1210if ( aExp == 0 ) {1211if ( aSig == 0 ) return 0;1212normalizeFloat32Subnormal( aSig, &aExp, &aSig );1213}1214zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;1215aSig = ( aSig | 0x00800000 )<<8;1216zSig = estimateSqrt32( aExp, aSig ) + 2;1217if ( ( zSig & 0x7F ) <= 5 ) {1218if ( zSig < 2 ) {1219zSig = 0x7FFFFFFF;1220goto roundAndPack;1221}1222else {1223aSig >>= aExp & 1;1224mul32To64( zSig, zSig, &term0, &term1 );1225sub64( aSig, 0, term0, term1, &rem0, &rem1 );1226while ( (sbits32) rem0 < 0 ) {1227--zSig;1228shortShift64Left( 0, zSig, 1, &term0, &term1 );1229term1 |= 1;1230add64( rem0, rem1, term0, term1, &rem0, &rem1 );1231}1232zSig |= ( ( rem0 | rem1 ) != 0 );1233}1234}1235shift32RightJamming( zSig, 1, &zSig );1236roundAndPack:1237return roundAndPackFloat32( 0, zExp, zSig );12381239}1240#endif12411242/*1243-------------------------------------------------------------------------------1244Returns 1 if the single-precision floating-point value `a' is equal to1245the corresponding value `b', and 0 otherwise. The comparison is performed1246according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1247-------------------------------------------------------------------------------1248*/1249flag float32_eq( float32 a, float32 b )1250{12511252if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1253|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1254) {1255if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {1256float_raise( float_flag_invalid );1257}1258return 0;1259}1260return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );12611262}12631264/*1265-------------------------------------------------------------------------------1266Returns 1 if the single-precision floating-point value `a' is less than1267or equal to the corresponding value `b', and 0 otherwise. The comparison1268is performed according to the IEC/IEEE Standard for Binary Floating-Point1269Arithmetic.1270-------------------------------------------------------------------------------1271*/1272flag float32_le( float32 a, float32 b )1273{1274flag aSign, bSign;12751276if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1277|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1278) {1279float_raise( float_flag_invalid );1280return 0;1281}1282aSign = extractFloat32Sign( a );1283bSign = extractFloat32Sign( b );1284if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );1285return ( a == b ) || ( aSign ^ ( a < b ) );12861287}12881289/*1290-------------------------------------------------------------------------------1291Returns 1 if the single-precision floating-point value `a' is less than1292the corresponding value `b', and 0 otherwise. The comparison is performed1293according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1294-------------------------------------------------------------------------------1295*/1296flag float32_lt( float32 a, float32 b )1297{1298flag aSign, bSign;12991300if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1301|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1302) {1303float_raise( float_flag_invalid );1304return 0;1305}1306aSign = extractFloat32Sign( a );1307bSign = extractFloat32Sign( b );1308if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );1309return ( a != b ) && ( aSign ^ ( a < b ) );13101311}13121313#ifndef SOFTFLOAT_FOR_GCC /* Not needed */1314/*1315-------------------------------------------------------------------------------1316Returns 1 if the single-precision floating-point value `a' is equal to1317the corresponding value `b', and 0 otherwise. The invalid exception is1318raised if either operand is a NaN. Otherwise, the comparison is performed1319according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.1320-------------------------------------------------------------------------------1321*/1322flag float32_eq_signaling( float32 a, float32 b )1323{13241325if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1326|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1327) {1328float_raise( float_flag_invalid );1329return 0;1330}1331return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );13321333}13341335/*1336-------------------------------------------------------------------------------1337Returns 1 if the single-precision floating-point value `a' is less than or1338equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not1339cause an exception. Otherwise, the comparison is performed according to the1340IEC/IEEE Standard for Binary Floating-Point Arithmetic.1341-------------------------------------------------------------------------------1342*/1343flag float32_le_quiet( float32 a, float32 b )1344{1345flag aSign, bSign;1346int16 aExp, bExp;13471348if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1349|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1350) {1351if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {1352float_raise( float_flag_invalid );1353}1354return 0;1355}1356aSign = extractFloat32Sign( a );1357bSign = extractFloat32Sign( b );1358if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );1359return ( a == b ) || ( aSign ^ ( a < b ) );13601361}13621363/*1364-------------------------------------------------------------------------------1365Returns 1 if the single-precision floating-point value `a' is less than1366the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an1367exception. Otherwise, the comparison is performed according to the IEC/IEEE1368Standard for Binary Floating-Point Arithmetic.1369-------------------------------------------------------------------------------1370*/1371flag float32_lt_quiet( float32 a, float32 b )1372{1373flag aSign, bSign;13741375if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )1376|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )1377) {1378if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {1379float_raise( float_flag_invalid );1380}1381return 0;1382}1383aSign = extractFloat32Sign( a );1384bSign = extractFloat32Sign( b );1385if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );1386return ( a != b ) && ( aSign ^ ( a < b ) );13871388}1389#endif /* !SOFTFLOAT_FOR_GCC */13901391#ifndef SOFTFLOAT_FOR_GCC /* Not needed */1392/*1393-------------------------------------------------------------------------------1394Returns the result of converting the double-precision floating-point value1395`a' to the 32-bit two's complement integer format. The conversion is1396performed according to the IEC/IEEE Standard for Binary Floating-Point1397Arithmetic---which means in particular that the conversion is rounded1398according to the current rounding mode. If `a' is a NaN, the largest1399positive integer is returned. Otherwise, if the conversion overflows, the1400largest integer with the same sign as `a' is returned.1401-------------------------------------------------------------------------------1402*/1403int32 float64_to_int32( float64 a )1404{1405flag aSign;1406int16 aExp, shiftCount;1407bits32 aSig0, aSig1, absZ, aSigExtra;1408int32 z;1409int8 roundingMode;14101411aSig1 = extractFloat64Frac1( a );1412aSig0 = extractFloat64Frac0( a );1413aExp = extractFloat64Exp( a );1414aSign = extractFloat64Sign( a );1415shiftCount = aExp - 0x413;1416if ( 0 <= shiftCount ) {1417if ( 0x41E < aExp ) {1418if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;1419goto invalid;1420}1421shortShift64Left(1422aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );1423if ( 0x80000000 < absZ ) goto invalid;1424}1425else {1426aSig1 = ( aSig1 != 0 );1427if ( aExp < 0x3FE ) {1428aSigExtra = aExp | aSig0 | aSig1;1429absZ = 0;1430}1431else {1432aSig0 |= 0x00100000;1433aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;1434absZ = aSig0>>( - shiftCount );1435}1436}1437roundingMode = float_rounding_mode;1438if ( roundingMode == float_round_nearest_even ) {1439if ( (sbits32) aSigExtra < 0 ) {1440++absZ;1441if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;1442}1443z = aSign ? - absZ : absZ;1444}1445else {1446aSigExtra = ( aSigExtra != 0 );1447if ( aSign ) {1448z = - ( absZ1449+ ( ( roundingMode == float_round_down ) & aSigExtra ) );1450}1451else {1452z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );1453}1454}1455if ( ( aSign ^ ( z < 0 ) ) && z ) {1456invalid:1457float_raise( float_flag_invalid );1458return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;1459}1460if ( aSigExtra ) float_exception_flags |= float_flag_inexact;1461return z;14621463}1464#endif /* !SOFTFLOAT_FOR_GCC */14651466/*1467-------------------------------------------------------------------------------1468Returns the result of converting the double-precision floating-point value1469`a' to the 32-bit two's complement integer format. The conversion is1470performed according to the IEC/IEEE Standard for Binary Floating-Point1471Arithmetic, except that the conversion is always rounded toward zero.1472If `a' is a NaN, the largest positive integer is returned. Otherwise, if1473the conversion overflows, the largest integer with the same sign as `a' is1474returned.1475-------------------------------------------------------------------------------1476*/1477int32 float64_to_int32_round_to_zero( float64 a )1478{1479flag aSign;1480int16 aExp, shiftCount;1481bits32 aSig0, aSig1, absZ, aSigExtra;1482int32 z;14831484aSig1 = extractFloat64Frac1( a );1485aSig0 = extractFloat64Frac0( a );1486aExp = extractFloat64Exp( a );1487aSign = extractFloat64Sign( a );1488shiftCount = aExp - 0x413;1489if ( 0 <= shiftCount ) {1490if ( 0x41E < aExp ) {1491if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;1492goto invalid;1493}1494shortShift64Left(1495aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );1496}1497else {1498if ( aExp < 0x3FF ) {1499if ( aExp | aSig0 | aSig1 ) {1500float_exception_flags |= float_flag_inexact;1501}1502return 0;1503}1504aSig0 |= 0x00100000;1505aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;1506absZ = aSig0>>( - shiftCount );1507}1508z = aSign ? - absZ : absZ;1509if ( ( aSign ^ ( z < 0 ) ) && z ) {1510invalid:1511float_raise( float_flag_invalid );1512return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;1513}1514if ( aSigExtra ) float_exception_flags |= float_flag_inexact;1515return z;15161517}15181519/*1520-------------------------------------------------------------------------------1521Returns the result of converting the double-precision floating-point value1522`a' to the single-precision floating-point format. The conversion is1523performed according to the IEC/IEEE Standard for Binary Floating-Point1524Arithmetic.1525-------------------------------------------------------------------------------1526*/1527float32 float64_to_float32( float64 a )1528{1529flag aSign;1530int16 aExp;1531bits32 aSig0, aSig1, zSig;1532bits32 allZero;15331534aSig1 = extractFloat64Frac1( a );1535aSig0 = extractFloat64Frac0( a );1536aExp = extractFloat64Exp( a );1537aSign = extractFloat64Sign( a );1538if ( aExp == 0x7FF ) {1539if ( aSig0 | aSig1 ) {1540return commonNaNToFloat32( float64ToCommonNaN( a ) );1541}1542return packFloat32( aSign, 0xFF, 0 );1543}1544shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );1545if ( aExp ) zSig |= 0x40000000;1546return roundAndPackFloat32( aSign, aExp - 0x381, zSig );15471548}15491550#ifndef SOFTFLOAT_FOR_GCC1551/*1552-------------------------------------------------------------------------------1553Rounds the double-precision floating-point value `a' to an integer,1554and returns the result as a double-precision floating-point value. The1555operation is performed according to the IEC/IEEE Standard for Binary1556Floating-Point Arithmetic.1557-------------------------------------------------------------------------------1558*/1559float64 float64_round_to_int( float64 a )1560{1561flag aSign;1562int16 aExp;1563bits32 lastBitMask, roundBitsMask;1564int8 roundingMode;1565float64 z;15661567aExp = extractFloat64Exp( a );1568if ( 0x413 <= aExp ) {1569if ( 0x433 <= aExp ) {1570if ( ( aExp == 0x7FF )1571&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {1572return propagateFloat64NaN( a, a );1573}1574return a;1575}1576lastBitMask = 1;1577lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;1578roundBitsMask = lastBitMask - 1;1579z = a;1580roundingMode = float_rounding_mode;1581if ( roundingMode == float_round_nearest_even ) {1582if ( lastBitMask ) {1583add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );1584if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;1585}1586else {1587if ( (sbits32) z.low < 0 ) {1588++z.high;1589if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;1590}1591}1592}1593else if ( roundingMode != float_round_to_zero ) {1594if ( extractFloat64Sign( z )1595^ ( roundingMode == float_round_up ) ) {1596add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );1597}1598}1599z.low &= ~ roundBitsMask;1600}1601else {1602if ( aExp <= 0x3FE ) {1603if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;1604float_exception_flags |= float_flag_inexact;1605aSign = extractFloat64Sign( a );1606switch ( float_rounding_mode ) {1607case float_round_nearest_even:1608if ( ( aExp == 0x3FE )1609&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )1610) {1611return packFloat64( aSign, 0x3FF, 0, 0 );1612}1613break;1614case float_round_down:1615return1616aSign ? packFloat64( 1, 0x3FF, 0, 0 )1617: packFloat64( 0, 0, 0, 0 );1618case float_round_up:1619return1620aSign ? packFloat64( 1, 0, 0, 0 )1621: packFloat64( 0, 0x3FF, 0, 0 );1622}1623return packFloat64( aSign, 0, 0, 0 );1624}1625lastBitMask = 1;1626lastBitMask <<= 0x413 - aExp;1627roundBitsMask = lastBitMask - 1;1628z.low = 0;1629z.high = a.high;1630roundingMode = float_rounding_mode;1631if ( roundingMode == float_round_nearest_even ) {1632z.high += lastBitMask>>1;1633if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {1634z.high &= ~ lastBitMask;1635}1636}1637else if ( roundingMode != float_round_to_zero ) {1638if ( extractFloat64Sign( z )1639^ ( roundingMode == float_round_up ) ) {1640z.high |= ( a.low != 0 );1641z.high += roundBitsMask;1642}1643}1644z.high &= ~ roundBitsMask;1645}1646if ( ( z.low != a.low ) || ( z.high != a.high ) ) {1647float_exception_flags |= float_flag_inexact;1648}1649return z;16501651}1652#endif16531654/*1655-------------------------------------------------------------------------------1656Returns the result of adding the absolute values of the double-precision1657floating-point values `a' and `b'. If `zSign' is 1, the sum is negated1658before being returned. `zSign' is ignored if the result is a NaN.1659The addition is performed according to the IEC/IEEE Standard for Binary1660Floating-Point Arithmetic.1661-------------------------------------------------------------------------------1662*/1663static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )1664{1665int16 aExp, bExp, zExp;1666bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;1667int16 expDiff;16681669aSig1 = extractFloat64Frac1( a );1670aSig0 = extractFloat64Frac0( a );1671aExp = extractFloat64Exp( a );1672bSig1 = extractFloat64Frac1( b );1673bSig0 = extractFloat64Frac0( b );1674bExp = extractFloat64Exp( b );1675expDiff = aExp - bExp;1676if ( 0 < expDiff ) {1677if ( aExp == 0x7FF ) {1678if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );1679return a;1680}1681if ( bExp == 0 ) {1682--expDiff;1683}1684else {1685bSig0 |= 0x00100000;1686}1687shift64ExtraRightJamming(1688bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );1689zExp = aExp;1690}1691else if ( expDiff < 0 ) {1692if ( bExp == 0x7FF ) {1693if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );1694return packFloat64( zSign, 0x7FF, 0, 0 );1695}1696if ( aExp == 0 ) {1697++expDiff;1698}1699else {1700aSig0 |= 0x00100000;1701}1702shift64ExtraRightJamming(1703aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );1704zExp = bExp;1705}1706else {1707if ( aExp == 0x7FF ) {1708if ( aSig0 | aSig1 | bSig0 | bSig1 ) {1709return propagateFloat64NaN( a, b );1710}1711return a;1712}1713add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );1714if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );1715zSig2 = 0;1716zSig0 |= 0x00200000;1717zExp = aExp;1718goto shiftRight1;1719}1720aSig0 |= 0x00100000;1721add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );1722--zExp;1723if ( zSig0 < 0x00200000 ) goto roundAndPack;1724++zExp;1725shiftRight1:1726shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );1727roundAndPack:1728return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );17291730}17311732/*1733-------------------------------------------------------------------------------1734Returns the result of subtracting the absolute values of the double-1735precision floating-point values `a' and `b'. If `zSign' is 1, the1736difference is negated before being returned. `zSign' is ignored if the1737result is a NaN. The subtraction is performed according to the IEC/IEEE1738Standard for Binary Floating-Point Arithmetic.1739-------------------------------------------------------------------------------1740*/1741static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )1742{1743int16 aExp, bExp, zExp;1744bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;1745int16 expDiff;17461747aSig1 = extractFloat64Frac1( a );1748aSig0 = extractFloat64Frac0( a );1749aExp = extractFloat64Exp( a );1750bSig1 = extractFloat64Frac1( b );1751bSig0 = extractFloat64Frac0( b );1752bExp = extractFloat64Exp( b );1753expDiff = aExp - bExp;1754shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );1755shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );1756if ( 0 < expDiff ) goto aExpBigger;1757if ( expDiff < 0 ) goto bExpBigger;1758if ( aExp == 0x7FF ) {1759if ( aSig0 | aSig1 | bSig0 | bSig1 ) {1760return propagateFloat64NaN( a, b );1761}1762float_raise( float_flag_invalid );1763return float64_default_nan;1764}1765if ( aExp == 0 ) {1766aExp = 1;1767bExp = 1;1768}1769if ( bSig0 < aSig0 ) goto aBigger;1770if ( aSig0 < bSig0 ) goto bBigger;1771if ( bSig1 < aSig1 ) goto aBigger;1772if ( aSig1 < bSig1 ) goto bBigger;1773return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );1774bExpBigger:1775if ( bExp == 0x7FF ) {1776if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );1777return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );1778}1779if ( aExp == 0 ) {1780++expDiff;1781}1782else {1783aSig0 |= 0x40000000;1784}1785shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );1786bSig0 |= 0x40000000;1787bBigger:1788sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );1789zExp = bExp;1790zSign ^= 1;1791goto normalizeRoundAndPack;1792aExpBigger:1793if ( aExp == 0x7FF ) {1794if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );1795return a;1796}1797if ( bExp == 0 ) {1798--expDiff;1799}1800else {1801bSig0 |= 0x40000000;1802}1803shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );1804aSig0 |= 0x40000000;1805aBigger:1806sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );1807zExp = aExp;1808normalizeRoundAndPack:1809--zExp;1810return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );18111812}18131814/*1815-------------------------------------------------------------------------------1816Returns the result of adding the double-precision floating-point values `a'1817and `b'. The operation is performed according to the IEC/IEEE Standard for1818Binary Floating-Point Arithmetic.1819-------------------------------------------------------------------------------1820*/1821float64 float64_add( float64 a, float64 b )1822{1823flag aSign, bSign;18241825aSign = extractFloat64Sign( a );1826bSign = extractFloat64Sign( b );1827if ( aSign == bSign ) {1828return addFloat64Sigs( a, b, aSign );1829}1830else {1831return subFloat64Sigs( a, b, aSign );1832}18331834}18351836/*1837-------------------------------------------------------------------------------1838Returns the result of subtracting the double-precision floating-point values1839`a' and `b'. The operation is performed according to the IEC/IEEE Standard1840for Binary Floating-Point Arithmetic.1841-------------------------------------------------------------------------------1842*/1843float64 float64_sub( float64 a, float64 b )1844{1845flag aSign, bSign;18461847aSign = extractFloat64Sign( a );1848bSign = extractFloat64Sign( b );1849if ( aSign == bSign ) {1850return subFloat64Sigs( a, b, aSign );1851}1852else {1853return addFloat64Sigs( a, b, aSign );1854}18551856}18571858/*1859-------------------------------------------------------------------------------1860Returns the result of multiplying the double-precision floating-point values1861`a' and `b'. The operation is performed according to the IEC/IEEE Standard1862for Binary Floating-Point Arithmetic.1863-------------------------------------------------------------------------------1864*/1865float64 float64_mul( float64 a, float64 b )1866{1867flag aSign, bSign, zSign;1868int16 aExp, bExp, zExp;1869bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;18701871aSig1 = extractFloat64Frac1( a );1872aSig0 = extractFloat64Frac0( a );1873aExp = extractFloat64Exp( a );1874aSign = extractFloat64Sign( a );1875bSig1 = extractFloat64Frac1( b );1876bSig0 = extractFloat64Frac0( b );1877bExp = extractFloat64Exp( b );1878bSign = extractFloat64Sign( b );1879zSign = aSign ^ bSign;1880if ( aExp == 0x7FF ) {1881if ( ( aSig0 | aSig1 )1882|| ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {1883return propagateFloat64NaN( a, b );1884}1885if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;1886return packFloat64( zSign, 0x7FF, 0, 0 );1887}1888if ( bExp == 0x7FF ) {1889if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );1890if ( ( aExp | aSig0 | aSig1 ) == 0 ) {1891invalid:1892float_raise( float_flag_invalid );1893return float64_default_nan;1894}1895return packFloat64( zSign, 0x7FF, 0, 0 );1896}1897if ( aExp == 0 ) {1898if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );1899normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );1900}1901if ( bExp == 0 ) {1902if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );1903normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );1904}1905zExp = aExp + bExp - 0x400;1906aSig0 |= 0x00100000;1907shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );1908mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );1909add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );1910zSig2 |= ( zSig3 != 0 );1911if ( 0x00200000 <= zSig0 ) {1912shift64ExtraRightJamming(1913zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );1914++zExp;1915}1916return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );19171918}19191920/*1921-------------------------------------------------------------------------------1922Returns the result of dividing the double-precision floating-point value `a'1923by the corresponding value `b'. The operation is performed according to the1924IEC/IEEE Standard for Binary Floating-Point Arithmetic.1925-------------------------------------------------------------------------------1926*/1927float64 float64_div( float64 a, float64 b )1928{1929flag aSign, bSign, zSign;1930int16 aExp, bExp, zExp;1931bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;1932bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;19331934aSig1 = extractFloat64Frac1( a );1935aSig0 = extractFloat64Frac0( a );1936aExp = extractFloat64Exp( a );1937aSign = extractFloat64Sign( a );1938bSig1 = extractFloat64Frac1( b );1939bSig0 = extractFloat64Frac0( b );1940bExp = extractFloat64Exp( b );1941bSign = extractFloat64Sign( b );1942zSign = aSign ^ bSign;1943if ( aExp == 0x7FF ) {1944if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );1945if ( bExp == 0x7FF ) {1946if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );1947goto invalid;1948}1949return packFloat64( zSign, 0x7FF, 0, 0 );1950}1951if ( bExp == 0x7FF ) {1952if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );1953return packFloat64( zSign, 0, 0, 0 );1954}1955if ( bExp == 0 ) {1956if ( ( bSig0 | bSig1 ) == 0 ) {1957if ( ( aExp | aSig0 | aSig1 ) == 0 ) {1958invalid:1959float_raise( float_flag_invalid );1960return float64_default_nan;1961}1962float_raise( float_flag_divbyzero );1963return packFloat64( zSign, 0x7FF, 0, 0 );1964}1965normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );1966}1967if ( aExp == 0 ) {1968if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );1969normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );1970}1971zExp = aExp - bExp + 0x3FD;1972shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );1973shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );1974if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {1975shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );1976++zExp;1977}1978zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );1979mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );1980sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );1981while ( (sbits32) rem0 < 0 ) {1982--zSig0;1983add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );1984}1985zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );1986if ( ( zSig1 & 0x3FF ) <= 4 ) {1987mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );1988sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );1989while ( (sbits32) rem1 < 0 ) {1990--zSig1;1991add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );1992}1993zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );1994}1995shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );1996return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );19971998}19992000#ifndef SOFTFLOAT_FOR_GCC2001/*2002-------------------------------------------------------------------------------2003Returns the remainder of the double-precision floating-point value `a'2004with respect to the corresponding value `b'. The operation is performed2005according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2006-------------------------------------------------------------------------------2007*/2008float64 float64_rem( float64 a, float64 b )2009{2010flag aSign, bSign, zSign;2011int16 aExp, bExp, expDiff;2012bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;2013bits32 allZero, alternateASig0, alternateASig1, sigMean1;2014sbits32 sigMean0;2015float64 z;20162017aSig1 = extractFloat64Frac1( a );2018aSig0 = extractFloat64Frac0( a );2019aExp = extractFloat64Exp( a );2020aSign = extractFloat64Sign( a );2021bSig1 = extractFloat64Frac1( b );2022bSig0 = extractFloat64Frac0( b );2023bExp = extractFloat64Exp( b );2024bSign = extractFloat64Sign( b );2025if ( aExp == 0x7FF ) {2026if ( ( aSig0 | aSig1 )2027|| ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {2028return propagateFloat64NaN( a, b );2029}2030goto invalid;2031}2032if ( bExp == 0x7FF ) {2033if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );2034return a;2035}2036if ( bExp == 0 ) {2037if ( ( bSig0 | bSig1 ) == 0 ) {2038invalid:2039float_raise( float_flag_invalid );2040return float64_default_nan;2041}2042normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );2043}2044if ( aExp == 0 ) {2045if ( ( aSig0 | aSig1 ) == 0 ) return a;2046normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );2047}2048expDiff = aExp - bExp;2049if ( expDiff < -1 ) return a;2050shortShift64Left(2051aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );2052shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );2053q = le64( bSig0, bSig1, aSig0, aSig1 );2054if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );2055expDiff -= 32;2056while ( 0 < expDiff ) {2057q = estimateDiv64To32( aSig0, aSig1, bSig0 );2058q = ( 4 < q ) ? q - 4 : 0;2059mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );2060shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );2061shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );2062sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );2063expDiff -= 29;2064}2065if ( -32 < expDiff ) {2066q = estimateDiv64To32( aSig0, aSig1, bSig0 );2067q = ( 4 < q ) ? q - 4 : 0;2068q >>= - expDiff;2069shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );2070expDiff += 24;2071if ( expDiff < 0 ) {2072shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );2073}2074else {2075shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );2076}2077mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );2078sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );2079}2080else {2081shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );2082shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );2083}2084do {2085alternateASig0 = aSig0;2086alternateASig1 = aSig1;2087++q;2088sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );2089} while ( 0 <= (sbits32) aSig0 );2090add64(2091aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );2092if ( ( sigMean0 < 0 )2093|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {2094aSig0 = alternateASig0;2095aSig1 = alternateASig1;2096}2097zSign = ( (sbits32) aSig0 < 0 );2098if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );2099return2100normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );21012102}2103#endif21042105#ifndef SOFTFLOAT_FOR_GCC2106/*2107-------------------------------------------------------------------------------2108Returns the square root of the double-precision floating-point value `a'.2109The operation is performed according to the IEC/IEEE Standard for Binary2110Floating-Point Arithmetic.2111-------------------------------------------------------------------------------2112*/2113float64 float64_sqrt( float64 a )2114{2115flag aSign;2116int16 aExp, zExp;2117bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;2118bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;2119float64 z;21202121aSig1 = extractFloat64Frac1( a );2122aSig0 = extractFloat64Frac0( a );2123aExp = extractFloat64Exp( a );2124aSign = extractFloat64Sign( a );2125if ( aExp == 0x7FF ) {2126if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );2127if ( ! aSign ) return a;2128goto invalid;2129}2130if ( aSign ) {2131if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;2132invalid:2133float_raise( float_flag_invalid );2134return float64_default_nan;2135}2136if ( aExp == 0 ) {2137if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );2138normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );2139}2140zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;2141aSig0 |= 0x00100000;2142shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );2143zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;2144if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;2145doubleZSig0 = zSig0 + zSig0;2146shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );2147mul32To64( zSig0, zSig0, &term0, &term1 );2148sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );2149while ( (sbits32) rem0 < 0 ) {2150--zSig0;2151doubleZSig0 -= 2;2152add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );2153}2154zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );2155if ( ( zSig1 & 0x1FF ) <= 5 ) {2156if ( zSig1 == 0 ) zSig1 = 1;2157mul32To64( doubleZSig0, zSig1, &term1, &term2 );2158sub64( rem1, 0, term1, term2, &rem1, &rem2 );2159mul32To64( zSig1, zSig1, &term2, &term3 );2160sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );2161while ( (sbits32) rem1 < 0 ) {2162--zSig1;2163shortShift64Left( 0, zSig1, 1, &term2, &term3 );2164term3 |= 1;2165term2 |= doubleZSig0;2166add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );2167}2168zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );2169}2170shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );2171return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );21722173}2174#endif21752176/*2177-------------------------------------------------------------------------------2178Returns 1 if the double-precision floating-point value `a' is equal to2179the corresponding value `b', and 0 otherwise. The comparison is performed2180according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2181-------------------------------------------------------------------------------2182*/2183flag float64_eq( float64 a, float64 b )2184{21852186if ( ( ( extractFloat64Exp( a ) == 0x7FF )2187&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2188|| ( ( extractFloat64Exp( b ) == 0x7FF )2189&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2190) {2191if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {2192float_raise( float_flag_invalid );2193}2194return 0;2195}2196return ( a == b ) ||2197( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );21982199}22002201/*2202-------------------------------------------------------------------------------2203Returns 1 if the double-precision floating-point value `a' is less than2204or equal to the corresponding value `b', and 0 otherwise. The comparison2205is performed according to the IEC/IEEE Standard for Binary Floating-Point2206Arithmetic.2207-------------------------------------------------------------------------------2208*/2209flag float64_le( float64 a, float64 b )2210{2211flag aSign, bSign;22122213if ( ( ( extractFloat64Exp( a ) == 0x7FF )2214&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2215|| ( ( extractFloat64Exp( b ) == 0x7FF )2216&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2217) {2218float_raise( float_flag_invalid );2219return 0;2220}2221aSign = extractFloat64Sign( a );2222bSign = extractFloat64Sign( b );2223if ( aSign != bSign )2224return aSign ||2225( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==22260 );2227return ( a == b ) ||2228( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );2229}22302231/*2232-------------------------------------------------------------------------------2233Returns 1 if the double-precision floating-point value `a' is less than2234the corresponding value `b', and 0 otherwise. The comparison is performed2235according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2236-------------------------------------------------------------------------------2237*/2238flag float64_lt( float64 a, float64 b )2239{2240flag aSign, bSign;22412242if ( ( ( extractFloat64Exp( a ) == 0x7FF )2243&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2244|| ( ( extractFloat64Exp( b ) == 0x7FF )2245&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2246) {2247float_raise( float_flag_invalid );2248return 0;2249}2250aSign = extractFloat64Sign( a );2251bSign = extractFloat64Sign( b );2252if ( aSign != bSign )2253return aSign &&2254( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=22550 );2256return ( a != b ) &&2257( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );22582259}22602261#ifndef SOFTFLOAT_FOR_GCC2262/*2263-------------------------------------------------------------------------------2264Returns 1 if the double-precision floating-point value `a' is equal to2265the corresponding value `b', and 0 otherwise. The invalid exception is2266raised if either operand is a NaN. Otherwise, the comparison is performed2267according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.2268-------------------------------------------------------------------------------2269*/2270flag float64_eq_signaling( float64 a, float64 b )2271{22722273if ( ( ( extractFloat64Exp( a ) == 0x7FF )2274&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2275|| ( ( extractFloat64Exp( b ) == 0x7FF )2276&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2277) {2278float_raise( float_flag_invalid );2279return 0;2280}2281return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );22822283}22842285/*2286-------------------------------------------------------------------------------2287Returns 1 if the double-precision floating-point value `a' is less than or2288equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not2289cause an exception. Otherwise, the comparison is performed according to the2290IEC/IEEE Standard for Binary Floating-Point Arithmetic.2291-------------------------------------------------------------------------------2292*/2293flag float64_le_quiet( float64 a, float64 b )2294{2295flag aSign, bSign;22962297if ( ( ( extractFloat64Exp( a ) == 0x7FF )2298&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2299|| ( ( extractFloat64Exp( b ) == 0x7FF )2300&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2301) {2302if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {2303float_raise( float_flag_invalid );2304}2305return 0;2306}2307aSign = extractFloat64Sign( a );2308bSign = extractFloat64Sign( b );2309if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );2310return ( a == b ) || ( aSign ^ ( a < b ) );23112312}23132314/*2315-------------------------------------------------------------------------------2316Returns 1 if the double-precision floating-point value `a' is less than2317the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an2318exception. Otherwise, the comparison is performed according to the IEC/IEEE2319Standard for Binary Floating-Point Arithmetic.2320-------------------------------------------------------------------------------2321*/2322flag float64_lt_quiet( float64 a, float64 b )2323{2324flag aSign, bSign;23252326if ( ( ( extractFloat64Exp( a ) == 0x7FF )2327&& ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )2328|| ( ( extractFloat64Exp( b ) == 0x7FF )2329&& ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )2330) {2331if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {2332float_raise( float_flag_invalid );2333}2334return 0;2335}2336aSign = extractFloat64Sign( a );2337bSign = extractFloat64Sign( b );2338if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );2339return ( a != b ) && ( aSign ^ ( a < b ) );23402341}23422343#endif234423452346