arch/arm/nwfpe/softfloat.c

   1 /*
   2 ===============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5 Arithmetic Package, Release 2.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  15 arithmetic/softfloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23 Derivative works are acceptable, even for commercial purposes, so long as
  24 (1) they include prominent notice that the work is derivative, and (2) they
  25 include prominent notice akin to these three paragraphs for those parts of
  26 this code that are retained.
  27
  28 ===============================================================================
  29 */
  30
  31 #include <asm/div64.h>
  32
  33 #include "fpa11.h"
  34 //#include "milieu.h"
  35 //#include "softfloat.h"
  36
  37 /*
  38 -------------------------------------------------------------------------------
  39 Floating-point rounding mode, extended double-precision rounding precision,
  40 and exception flags.
  41 -------------------------------------------------------------------------------
  42 */
  43 int8 float_rounding_mode = float_round_nearest_even;
  44 int8 floatx80_rounding_precision = 80;
  45 int8 float_exception_flags;
  46
  47 /*
  48 -------------------------------------------------------------------------------
  49 Primitive arithmetic functions, including multi-word arithmetic, and
  50 division and square root approximations.  (Can be specialized to target if
  51 desired.)
  52 -------------------------------------------------------------------------------
  53 */
  54 #include "softfloat-macros"
  55
  56 /*
  57 -------------------------------------------------------------------------------
  58 Functions and definitions to determine:  (1) whether tininess for underflow
  59 is detected before or after rounding by default, (2) what (if anything)
  60 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  61 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  62 are propagated from function inputs to output.  These details are target-
  63 specific.
  64 -------------------------------------------------------------------------------
  65 */
  66 #include "softfloat-specialize"
  67
  68 /*
  69 -------------------------------------------------------------------------------
  70 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 input.  If `zSign' is nonzero, the input is negated before being converted
  73 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  74 input is simply rounded to an integer, with the inexact exception raised if
  75 the input cannot be represented exactly as an integer.  If the fixed-point
  76 input is too large, however, the invalid exception is raised and the largest
  77 positive or negative integer is returned.
  78 -------------------------------------------------------------------------------
  79 */
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = float_rounding_mode;
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_exception_flags |= float_flag_invalid;
 111         return zSign ? 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*
 119 -------------------------------------------------------------------------------
 120 Returns the fraction bits of the single-precision floating-point value `a'.
 121 -------------------------------------------------------------------------------
 122 */
 123 INLINE bits32 extractFloat32Frac( float32 a )
 124 {
 125
 126     return a & 0x007FFFFF;
 127
 128 }
 129
 130 /*
 131 -------------------------------------------------------------------------------
 132 Returns the exponent bits of the single-precision floating-point value `a'.
 133 -------------------------------------------------------------------------------
 134 */
 135 INLINE int16 extractFloat32Exp( float32 a )
 136 {
 137
 138     return ( a>>23 ) & 0xFF;
 139
 140 }
 141
 142 /*
 143 -------------------------------------------------------------------------------
 144 Returns the sign bit of the single-precision floating-point value `a'.
 145 -------------------------------------------------------------------------------
 146 */
 147 #if 0   /* in softfloat.h */
 148 INLINE flag extractFloat32Sign( float32 a )
 149 {
 150
 151     return a>>31;
 152
 153 }
 154 #endif
 155
 156 /*
 157 -------------------------------------------------------------------------------
 158 Normalizes the subnormal single-precision floating-point value represented
 159 by the denormalized significand `aSig'.  The normalized exponent and
 160 significand are stored at the locations pointed to by `zExpPtr' and
 161 `zSigPtr', respectively.
 162 -------------------------------------------------------------------------------
 163 */
 164 static void
 165  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 166 {
 167     int8 shiftCount;
 168
 169     shiftCount = countLeadingZeros32( aSig ) - 8;
 170     *zSigPtr = aSig<<shiftCount;
 171     *zExpPtr = 1 - shiftCount;
 172
 173 }
 174
 175 /*
 176 -------------------------------------------------------------------------------
 177 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 178 single-precision floating-point value, returning the result.  After being
 179 shifted into the proper positions, the three fields are simply added
 180 together to form the result.  This means that any integer portion of `zSig'
 181 will be added into the exponent.  Since a properly normalized significand
 182 will have an integer portion equal to 1, the `zExp' input should be 1 less
 183 than the desired result exponent whenever `zSig' is a complete, normalized
 184 significand.
 185 -------------------------------------------------------------------------------
 186 */
 187 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 188 {
 189 #if 0
 190    float32 f;
 191    __asm__("@ packFloat32                               \n\
 192             mov %0, %1, asl #31                         \n\
 193             orr %0, %2, asl #23                         \n\
 194             orr %0, %3"
 195             : /* no outputs */
 196             : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
 197             : "cc");
 198    return f;
 199 #else
 200     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 201 #endif
 202 }
 203
 204 /*
 205 -------------------------------------------------------------------------------
 206 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 207 and significand `zSig', and returns the proper single-precision floating-
 208 point value corresponding to the abstract input.  Ordinarily, the abstract
 209 value is simply rounded and packed into the single-precision format, with
 210 the inexact exception raised if the abstract input cannot be represented
 211 exactly.  If the abstract value is too large, however, the overflow and
 212 inexact exceptions are raised and an infinity or maximal finite value is
 213 returned.  If the abstract value is too small, the input value is rounded to
 214 a subnormal number, and the underflow and inexact exceptions are raised if
 215 the abstract input cannot be represented exactly as a subnormal single-
 216 precision floating-point number.
 217     The input significand `zSig' has its binary point between bits 30
 218 and 29, which is 7 bits to the left of the usual location.  This shifted
 219 significand must be normalized or smaller.  If `zSig' is not normalized,
 220 `zExp' must be 0; in that case, the result returned is a subnormal number,
 221 and it must not require rounding.  In the usual case that `zSig' is
 222 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 223 The handling of underflow and overflow follows the IEC/IEEE Standard for
 224 Binary Floating-point Arithmetic.
 225 -------------------------------------------------------------------------------
 226 */
 227 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 228 {
 229     int8 roundingMode;
 230     flag roundNearestEven;
 231     int8 roundIncrement, roundBits;
 232     flag isTiny;
 233
 234     roundingMode = float_rounding_mode;
 235     roundNearestEven = ( roundingMode == float_round_nearest_even );
 236     roundIncrement = 0x40;
 237     if ( ! roundNearestEven ) {
 238         if ( roundingMode == float_round_to_zero ) {
 239             roundIncrement = 0;
 240         }
 241         else {
 242             roundIncrement = 0x7F;
 243             if ( zSign ) {
 244                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 245             }
 246             else {
 247                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 248             }
 249         }
 250     }
 251     roundBits = zSig & 0x7F;
 252     if ( 0xFD <= (bits16) zExp ) {
 253         if (    ( 0xFD < zExp )
 254              || (    ( zExp == 0xFD )
 255                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 256            ) {
 257             float_raise( float_flag_overflow | float_flag_inexact );
 258             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 259         }
 260         if ( zExp < 0 ) {
 261             isTiny =
 262                    ( float_detect_tininess == float_tininess_before_rounding )
 263                 || ( zExp < -1 )
 264                 || ( zSig + roundIncrement < 0x80000000 );
 265             shift32RightJamming( zSig, - zExp, &zSig );
 266             zExp = 0;
 267             roundBits = zSig & 0x7F;
 268             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 269         }
 270     }
 271     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 272     zSig = ( zSig + roundIncrement )>>7;
 273     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 274     if ( zSig == 0 ) zExp = 0;
 275     return packFloat32( zSign, zExp, zSig );
 276
 277 }
 278
 279 /*
 280 -------------------------------------------------------------------------------
 281 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 282 and significand `zSig', and returns the proper single-precision floating-
 283 point value corresponding to the abstract input.  This routine is just like
 284 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
 285 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 286 point exponent.
 287 -------------------------------------------------------------------------------
 288 */
 289 static float32
 290  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 291 {
 292     int8 shiftCount;
 293
 294     shiftCount = countLeadingZeros32( zSig ) - 1;
 295     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 296
 297 }
 298
 299 /*
 300 -------------------------------------------------------------------------------
 301 Returns the fraction bits of the double-precision floating-point value `a'.
 302 -------------------------------------------------------------------------------
 303 */
 304 INLINE bits64 extractFloat64Frac( float64 a )
 305 {
 306
 307     return a & LIT64( 0x000FFFFFFFFFFFFF );
 308
 309 }
 310
 311 /*
 312 -------------------------------------------------------------------------------
 313 Returns the exponent bits of the double-precision floating-point value `a'.
 314 -------------------------------------------------------------------------------
 315 */
 316 INLINE int16 extractFloat64Exp( float64 a )
 317 {
 318
 319     return ( a>>52 ) & 0x7FF;
 320
 321 }
 322
 323 /*
 324 -------------------------------------------------------------------------------
 325 Returns the sign bit of the double-precision floating-point value `a'.
 326 -------------------------------------------------------------------------------
 327 */
 328 #if 0   /* in softfloat.h */
 329 INLINE flag extractFloat64Sign( float64 a )
 330 {
 331
 332     return a>>63;
 333
 334 }
 335 #endif
 336
 337 /*
 338 -------------------------------------------------------------------------------
 339 Normalizes the subnormal double-precision floating-point value represented
 340 by the denormalized significand `aSig'.  The normalized exponent and
 341 significand are stored at the locations pointed to by `zExpPtr' and
 342 `zSigPtr', respectively.
 343 -------------------------------------------------------------------------------
 344 */
 345 static void
 346  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 347 {
 348     int8 shiftCount;
 349
 350     shiftCount = countLeadingZeros64( aSig ) - 11;
 351     *zSigPtr = aSig<<shiftCount;
 352     *zExpPtr = 1 - shiftCount;
 353
 354 }
 355
 356 /*
 357 -------------------------------------------------------------------------------
 358 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 359 double-precision floating-point value, returning the result.  After being
 360 shifted into the proper positions, the three fields are simply added
 361 together to form the result.  This means that any integer portion of `zSig'
 362 will be added into the exponent.  Since a properly normalized significand
 363 will have an integer portion equal to 1, the `zExp' input should be 1 less
 364 than the desired result exponent whenever `zSig' is a complete, normalized
 365 significand.
 366 -------------------------------------------------------------------------------
 367 */
 368 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 369 {
 370
 371     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 372
 373 }
 374
 375 /*
 376 -------------------------------------------------------------------------------
 377 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 378 and significand `zSig', and returns the proper double-precision floating-
 379 point value corresponding to the abstract input.  Ordinarily, the abstract
 380 value is simply rounded and packed into the double-precision format, with
 381 the inexact exception raised if the abstract input cannot be represented
 382 exactly.  If the abstract value is too large, however, the overflow and
 383 inexact exceptions are raised and an infinity or maximal finite value is
 384 returned.  If the abstract value is too small, the input value is rounded to
 385 a subnormal number, and the underflow and inexact exceptions are raised if
 386 the abstract input cannot be represented exactly as a subnormal double-
 387 precision floating-point number.
 388     The input significand `zSig' has its binary point between bits 62
 389 and 61, which is 10 bits to the left of the usual location.  This shifted
 390 significand must be normalized or smaller.  If `zSig' is not normalized,
 391 `zExp' must be 0; in that case, the result returned is a subnormal number,
 392 and it must not require rounding.  In the usual case that `zSig' is
 393 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 394 The handling of underflow and overflow follows the IEC/IEEE Standard for
 395 Binary Floating-point Arithmetic.
 396 -------------------------------------------------------------------------------
 397 */
 398 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 399 {
 400     int8 roundingMode;
 401     flag roundNearestEven;
 402     int16 roundIncrement, roundBits;
 403     flag isTiny;
 404
 405     roundingMode = float_rounding_mode;
 406     roundNearestEven = ( roundingMode == float_round_nearest_even );
 407     roundIncrement = 0x200;
 408     if ( ! roundNearestEven ) {
 409         if ( roundingMode == float_round_to_zero ) {
 410             roundIncrement = 0;
 411         }
 412         else {
 413             roundIncrement = 0x3FF;
 414             if ( zSign ) {
 415                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 416             }
 417             else {
 418                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 419             }
 420         }
 421     }
 422     roundBits = zSig & 0x3FF;
 423     if ( 0x7FD <= (bits16) zExp ) {
 424         if (    ( 0x7FD < zExp )
 425              || (    ( zExp == 0x7FD )
 426                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 427            ) {
 428             //register int lr = __builtin_return_address(0);
 429             //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 430             float_raise( float_flag_overflow | float_flag_inexact );
 431             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 432         }
 433         if ( zExp < 0 ) {
 434             isTiny =
 435                    ( float_detect_tininess == float_tininess_before_rounding )
 436                 || ( zExp < -1 )
 437                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 438             shift64RightJamming( zSig, - zExp, &zSig );
 439             zExp = 0;
 440             roundBits = zSig & 0x3FF;
 441             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 442         }
 443     }
 444     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 445     zSig = ( zSig + roundIncrement )>>10;
 446     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 447     if ( zSig == 0 ) zExp = 0;
 448     return packFloat64( zSign, zExp, zSig );
 449
 450 }
 451
 452 /*
 453 -------------------------------------------------------------------------------
 454 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 455 and significand `zSig', and returns the proper double-precision floating-
 456 point value corresponding to the abstract input.  This routine is just like
 457 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
 458 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 459 point exponent.
 460 -------------------------------------------------------------------------------
 461 */
 462 static float64
 463  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 464 {
 465     int8 shiftCount;
 466
 467     shiftCount = countLeadingZeros64( zSig ) - 1;
 468     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 469
 470 }
 471
 472 #ifdef FLOATX80
 473
 474 /*
 475 -------------------------------------------------------------------------------
 476 Returns the fraction bits of the extended double-precision floating-point
 477 value `a'.
 478 -------------------------------------------------------------------------------
 479 */
 480 INLINE bits64 extractFloatx80Frac( floatx80 a )
 481 {
 482
 483     return a.low;
 484
 485 }
 486
 487 /*
 488 -------------------------------------------------------------------------------
 489 Returns the exponent bits of the extended double-precision floating-point
 490 value `a'.
 491 -------------------------------------------------------------------------------
 492 */
 493 INLINE int32 extractFloatx80Exp( floatx80 a )
 494 {
 495
 496     return a.high & 0x7FFF;
 497
 498 }
 499
 500 /*
 501 -------------------------------------------------------------------------------
 502 Returns the sign bit of the extended double-precision floating-point value
 503 `a'.
 504 -------------------------------------------------------------------------------
 505 */
 506 INLINE flag extractFloatx80Sign( floatx80 a )
 507 {
 508
 509     return a.high>>15;
 510
 511 }
 512
 513 /*
 514 -------------------------------------------------------------------------------
 515 Normalizes the subnormal extended double-precision floating-point value
 516 represented by the denormalized significand `aSig'.  The normalized exponent
 517 and significand are stored at the locations pointed to by `zExpPtr' and
 518 `zSigPtr', respectively.
 519 -------------------------------------------------------------------------------
 520 */
 521 static void
 522  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 523 {
 524     int8 shiftCount;
 525
 526     shiftCount = countLeadingZeros64( aSig );
 527     *zSigPtr = aSig<<shiftCount;
 528     *zExpPtr = 1 - shiftCount;
 529
 530 }
 531
 532 /*
 533 -------------------------------------------------------------------------------
 534 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 535 extended double-precision floating-point value, returning the result.
 536 -------------------------------------------------------------------------------
 537 */
 538 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 539 {
 540     floatx80 z;
 541
 542     z.low = zSig;
 543     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 544     return z;
 545
 546 }
 547
 548 /*
 549 -------------------------------------------------------------------------------
 550 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 551 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 552 and returns the proper extended double-precision floating-point value
 553 corresponding to the abstract input.  Ordinarily, the abstract value is
 554 rounded and packed into the extended double-precision format, with the
 555 inexact exception raised if the abstract input cannot be represented
 556 exactly.  If the abstract value is too large, however, the overflow and
 557 inexact exceptions are raised and an infinity or maximal finite value is
 558 returned.  If the abstract value is too small, the input value is rounded to
 559 a subnormal number, and the underflow and inexact exceptions are raised if
 560 the abstract input cannot be represented exactly as a subnormal extended
 561 double-precision floating-point number.
 562     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 563 number of bits as single or double precision, respectively.  Otherwise, the
 564 result is rounded to the full precision of the extended double-precision
 565 format.
 566     The input significand must be normalized or smaller.  If the input
 567 significand is not normalized, `zExp' must be 0; in that case, the result
 568 returned is a subnormal number, and it must not require rounding.  The
 569 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 570 Floating-point Arithmetic.
 571 -------------------------------------------------------------------------------
 572 */
 573 static floatx80
 574  roundAndPackFloatx80(
 575      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 576  )
 577 {
 578     int8 roundingMode;
 579     flag roundNearestEven, increment, isTiny;
 580     int64 roundIncrement, roundMask, roundBits;
 581
 582     roundingMode = float_rounding_mode;
 583     roundNearestEven = ( roundingMode == float_round_nearest_even );
 584     if ( roundingPrecision == 80 ) goto precision80;
 585     if ( roundingPrecision == 64 ) {
 586         roundIncrement = LIT64( 0x0000000000000400 );
 587         roundMask = LIT64( 0x00000000000007FF );
 588     }
 589     else if ( roundingPrecision == 32 ) {
 590         roundIncrement = LIT64( 0x0000008000000000 );
 591         roundMask = LIT64( 0x000000FFFFFFFFFF );
 592     }
 593     else {
 594         goto precision80;
 595     }
 596     zSig0 |= ( zSig1 != 0 );
 597     if ( ! roundNearestEven ) {
 598         if ( roundingMode == float_round_to_zero ) {
 599             roundIncrement = 0;
 600         }
 601         else {
 602             roundIncrement = roundMask;
 603             if ( zSign ) {
 604                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 605             }
 606             else {
 607                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 608             }
 609         }
 610     }
 611     roundBits = zSig0 & roundMask;
 612     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 613         if (    ( 0x7FFE < zExp )
 614              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 615            ) {
 616             goto overflow;
 617         }
 618         if ( zExp <= 0 ) {
 619             isTiny =
 620                    ( float_detect_tininess == float_tininess_before_rounding )
 621                 || ( zExp < 0 )
 622                 || ( zSig0 <= zSig0 + roundIncrement );
 623             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 624             zExp = 0;
 625             roundBits = zSig0 & roundMask;
 626             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 627             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 628             zSig0 += roundIncrement;
 629             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 630             roundIncrement = roundMask + 1;
 631             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 632                 roundMask |= roundIncrement;
 633             }
 634             zSig0 &= ~ roundMask;
 635             return packFloatx80( zSign, zExp, zSig0 );
 636         }
 637     }
 638     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 639     zSig0 += roundIncrement;
 640     if ( zSig0 < roundIncrement ) {
 641         ++zExp;
 642         zSig0 = LIT64( 0x8000000000000000 );
 643     }
 644     roundIncrement = roundMask + 1;
 645     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 646         roundMask |= roundIncrement;
 647     }
 648     zSig0 &= ~ roundMask;
 649     if ( zSig0 == 0 ) zExp = 0;
 650     return packFloatx80( zSign, zExp, zSig0 );
 651  precision80:
 652     increment = ( (sbits64) zSig1 < 0 );
 653     if ( ! roundNearestEven ) {
 654         if ( roundingMode == float_round_to_zero ) {
 655             increment = 0;
 656         }
 657         else {
 658             if ( zSign ) {
 659                 increment = ( roundingMode == float_round_down ) && zSig1;
 660             }
 661             else {
 662                 increment = ( roundingMode == float_round_up ) && zSig1;
 663             }
 664         }
 665     }
 666     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 667         if (    ( 0x7FFE < zExp )
 668              || (    ( zExp == 0x7FFE )
 669                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 670                   && increment
 671                 )
 672            ) {
 673             roundMask = 0;
 674  overflow:
 675             float_raise( float_flag_overflow | float_flag_inexact );
 676             if (    ( roundingMode == float_round_to_zero )
 677                  || ( zSign && ( roundingMode == float_round_up ) )
 678                  || ( ! zSign && ( roundingMode == float_round_down ) )
 679                ) {
 680                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 681             }
 682             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 683         }
 684         if ( zExp <= 0 ) {
 685             isTiny =
 686                    ( float_detect_tininess == float_tininess_before_rounding )
 687                 || ( zExp < 0 )
 688                 || ! increment
 689                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 690             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 691             zExp = 0;
 692             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 693             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 694             if ( roundNearestEven ) {
 695                 increment = ( (sbits64) zSig1 < 0 );
 696             }
 697             else {
 698                 if ( zSign ) {
 699                     increment = ( roundingMode == float_round_down ) && zSig1;
 700                 }
 701                 else {
 702                     increment = ( roundingMode == float_round_up ) && zSig1;
 703                 }
 704             }
 705             if ( increment ) {
 706                 ++zSig0;
 707                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 708                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 709             }
 710             return packFloatx80( zSign, zExp, zSig0 );
 711         }
 712     }
 713     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 714     if ( increment ) {
 715         ++zSig0;
 716         if ( zSig0 == 0 ) {
 717             ++zExp;
 718             zSig0 = LIT64( 0x8000000000000000 );
 719         }
 720         else {
 721             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 722         }
 723     }
 724     else {
 725         if ( zSig0 == 0 ) zExp = 0;
 726     }
 727
 728     return packFloatx80( zSign, zExp, zSig0 );
 729 }
 730
 731 /*
 732 -------------------------------------------------------------------------------
 733 Takes an abstract floating-point value having sign `zSign', exponent
 734 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 735 and returns the proper extended double-precision floating-point value
 736 corresponding to the abstract input.  This routine is just like
 737 `roundAndPackFloatx80' except that the input significand does not have to be
 738 normalized.
 739 -------------------------------------------------------------------------------
 740 */
 741 static floatx80
 742  normalizeRoundAndPackFloatx80(
 743      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 744  )
 745 {
 746     int8 shiftCount;
 747
 748     if ( zSig0 == 0 ) {
 749         zSig0 = zSig1;
 750         zSig1 = 0;
 751         zExp -= 64;
 752     }
 753     shiftCount = countLeadingZeros64( zSig0 );
 754     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 755     zExp -= shiftCount;
 756     return
 757         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 758
 759 }
 760
 761 #endif
 762
 763 /*
 764 -------------------------------------------------------------------------------
 765 Returns the result of converting the 32-bit two's complement integer `a' to
 766 the single-precision floating-point format.  The conversion is performed
 767 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 768 -------------------------------------------------------------------------------
 769 */
 770 float32 int32_to_float32( int32 a )
 771 {
 772     flag zSign;
 773
 774     if ( a == 0 ) return 0;
 775     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 776     zSign = ( a < 0 );
 777     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
 778
 779 }
 780
 781 /*
 782 -------------------------------------------------------------------------------
 783 Returns the result of converting the 32-bit two's complement integer `a' to
 784 the double-precision floating-point format.  The conversion is performed
 785 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 786 -------------------------------------------------------------------------------
 787 */
 788 float64 int32_to_float64( int32 a )
 789 {
 790     flag aSign;
 791     uint32 absA;
 792     int8 shiftCount;
 793     bits64 zSig;
 794
 795     if ( a == 0 ) return 0;
 796     aSign = ( a < 0 );
 797     absA = aSign ? - a : a;
 798     shiftCount = countLeadingZeros32( absA ) + 21;
 799     zSig = absA;
 800     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
 801
 802 }
 803
 804 #ifdef FLOATX80
 805
 806 /*
 807 -------------------------------------------------------------------------------
 808 Returns the result of converting the 32-bit two's complement integer `a'
 809 to the extended double-precision floating-point format.  The conversion
 810 is performed according to the IEC/IEEE Standard for Binary Floating-point
 811 Arithmetic.
 812 -------------------------------------------------------------------------------
 813 */
 814 floatx80 int32_to_floatx80( int32 a )
 815 {
 816     flag zSign;
 817     uint32 absA;
 818     int8 shiftCount;
 819     bits64 zSig;
 820
 821     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 822     zSign = ( a < 0 );
 823     absA = zSign ? - a : a;
 824     shiftCount = countLeadingZeros32( absA ) + 32;
 825     zSig = absA;
 826     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 827
 828 }
 829
 830 #endif
 831
 832 /*
 833 -------------------------------------------------------------------------------
 834 Returns the result of converting the single-precision floating-point value
 835 `a' to the 32-bit two's complement integer format.  The conversion is
 836 performed according to the IEC/IEEE Standard for Binary Floating-point
 837 Arithmetic---which means in particular that the conversion is rounded
 838 according to the current rounding mode.  If `a' is a NaN, the largest
 839 positive integer is returned.  Otherwise, if the conversion overflows, the
 840 largest integer with the same sign as `a' is returned.
 841 -------------------------------------------------------------------------------
 842 */
 843 int32 float32_to_int32( float32 a )
 844 {
 845     flag aSign;
 846     int16 aExp, shiftCount;
 847     bits32 aSig;
 848     bits64 zSig;
 849
 850     aSig = extractFloat32Frac( a );
 851     aExp = extractFloat32Exp( a );
 852     aSign = extractFloat32Sign( a );
 853     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 854     if ( aExp ) aSig |= 0x00800000;
 855     shiftCount = 0xAF - aExp;
 856     zSig = aSig;
 857     zSig <<= 32;
 858     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
 859     return roundAndPackInt32( aSign, zSig );
 860
 861 }
 862
 863 /*
 864 -------------------------------------------------------------------------------
 865 Returns the result of converting the single-precision floating-point value
 866 `a' to the 32-bit two's complement integer format.  The conversion is
 867 performed according to the IEC/IEEE Standard for Binary Floating-point
 868 Arithmetic, except that the conversion is always rounded toward zero.  If
 869 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 870 conversion overflows, the largest integer with the same sign as `a' is
 871 returned.
 872 -------------------------------------------------------------------------------
 873 */
 874 int32 float32_to_int32_round_to_zero( float32 a )
 875 {
 876     flag aSign;
 877     int16 aExp, shiftCount;
 878     bits32 aSig;
 879     int32 z;
 880
 881     aSig = extractFloat32Frac( a );
 882     aExp = extractFloat32Exp( a );
 883     aSign = extractFloat32Sign( a );
 884     shiftCount = aExp - 0x9E;
 885     if ( 0 <= shiftCount ) {
 886         if ( a == 0xCF000000 ) return 0x80000000;
 887         float_raise( float_flag_invalid );
 888         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 889         return 0x80000000;
 890     }
 891     else if ( aExp <= 0x7E ) {
 892         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
 893         return 0;
 894     }
 895     aSig = ( aSig | 0x00800000 )<<8;
 896     z = aSig>>( - shiftCount );
 897     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 898         float_exception_flags |= float_flag_inexact;
 899     }
 900     return aSign ? - z : z;
 901
 902 }
 903
 904 /*
 905 -------------------------------------------------------------------------------
 906 Returns the result of converting the single-precision floating-point value
 907 `a' to the double-precision floating-point format.  The conversion is
 908 performed according to the IEC/IEEE Standard for Binary Floating-point
 909 Arithmetic.
 910 -------------------------------------------------------------------------------
 911 */
 912 float64 float32_to_float64( float32 a )
 913 {
 914     flag aSign;
 915     int16 aExp;
 916     bits32 aSig;
 917
 918     aSig = extractFloat32Frac( a );
 919     aExp = extractFloat32Exp( a );
 920     aSign = extractFloat32Sign( a );
 921     if ( aExp == 0xFF ) {
 922         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 923         return packFloat64( aSign, 0x7FF, 0 );
 924     }
 925     if ( aExp == 0 ) {
 926         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 927         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 928         --aExp;
 929     }
 930     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 931
 932 }
 933
 934 #ifdef FLOATX80
 935
 936 /*
 937 -------------------------------------------------------------------------------
 938 Returns the result of converting the single-precision floating-point value
 939 `a' to the extended double-precision floating-point format.  The conversion
 940 is performed according to the IEC/IEEE Standard for Binary Floating-point
 941 Arithmetic.
 942 -------------------------------------------------------------------------------
 943 */
 944 floatx80 float32_to_floatx80( float32 a )
 945 {
 946     flag aSign;
 947     int16 aExp;
 948     bits32 aSig;
 949
 950     aSig = extractFloat32Frac( a );
 951     aExp = extractFloat32Exp( a );
 952     aSign = extractFloat32Sign( a );
 953     if ( aExp == 0xFF ) {
 954         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 955         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 956     }
 957     if ( aExp == 0 ) {
 958         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 959         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 960     }
 961     aSig |= 0x00800000;
 962     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 963
 964 }
 965
 966 #endif
 967
 968 /*
 969 -------------------------------------------------------------------------------
 970 Rounds the single-precision floating-point value `a' to an integer, and
 971 returns the result as a single-precision floating-point value.  The
 972 operation is performed according to the IEC/IEEE Standard for Binary
 973 Floating-point Arithmetic.
 974 -------------------------------------------------------------------------------
 975 */
 976 float32 float32_round_to_int( float32 a )
 977 {
 978     flag aSign;
 979     int16 aExp;
 980     bits32 lastBitMask, roundBitsMask;
 981     int8 roundingMode;
 982     float32 z;
 983
 984     aExp = extractFloat32Exp( a );
 985     if ( 0x96 <= aExp ) {
 986         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 987             return propagateFloat32NaN( a, a );
 988         }
 989         return a;
 990     }
 991     if ( aExp <= 0x7E ) {
 992         if ( (bits32) ( a<<1 ) == 0 ) return a;
 993         float_exception_flags |= float_flag_inexact;
 994         aSign = extractFloat32Sign( a );
 995         switch ( float_rounding_mode ) {
 996          case float_round_nearest_even:
 997             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 998                 return packFloat32( aSign, 0x7F, 0 );
 999             }
1000             break;
1001          case float_round_down:
1002             return aSign ? 0xBF800000 : 0;
1003          case float_round_up:
1004             return aSign ? 0x80000000 : 0x3F800000;
1005         }
1006         return packFloat32( aSign, 0, 0 );
1007     }
1008     lastBitMask = 1;
1009     lastBitMask <<= 0x96 - aExp;
1010     roundBitsMask = lastBitMask - 1;
1011     z = a;
1012     roundingMode = float_rounding_mode;
1013     if ( roundingMode == float_round_nearest_even ) {
1014         z += lastBitMask>>1;
1015         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1016     }
1017     else if ( roundingMode != float_round_to_zero ) {
1018         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1019             z += roundBitsMask;
1020         }
1021     }
1022     z &= ~ roundBitsMask;
1023     if ( z != a ) float_exception_flags |= float_flag_inexact;
1024     return z;
1025
1026 }
1027
1028 /*
1029 -------------------------------------------------------------------------------
1030 Returns the result of adding the absolute values of the single-precision
1031 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1032 before being returned.  `zSign' is ignored if the result is a NaN.  The
1033 addition is performed according to the IEC/IEEE Standard for Binary
1034 Floating-point Arithmetic.
1035 -------------------------------------------------------------------------------
1036 */
1037 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1038 {
1039     int16 aExp, bExp, zExp;
1040     bits32 aSig, bSig, zSig;
1041     int16 expDiff;
1042
1043     aSig = extractFloat32Frac( a );
1044     aExp = extractFloat32Exp( a );
1045     bSig = extractFloat32Frac( b );
1046     bExp = extractFloat32Exp( b );
1047     expDiff = aExp - bExp;
1048     aSig <<= 6;
1049     bSig <<= 6;
1050     if ( 0 < expDiff ) {
1051         if ( aExp == 0xFF ) {
1052             if ( aSig ) return propagateFloat32NaN( a, b );
1053             return a;
1054         }
1055         if ( bExp == 0 ) {
1056             --expDiff;
1057         }
1058         else {
1059             bSig |= 0x20000000;
1060         }
1061         shift32RightJamming( bSig, expDiff, &bSig );
1062         zExp = aExp;
1063     }
1064     else if ( expDiff < 0 ) {
1065         if ( bExp == 0xFF ) {
1066             if ( bSig ) return propagateFloat32NaN( a, b );
1067             return packFloat32( zSign, 0xFF, 0 );
1068         }
1069         if ( aExp == 0 ) {
1070             ++expDiff;
1071         }
1072         else {
1073             aSig |= 0x20000000;
1074         }
1075         shift32RightJamming( aSig, - expDiff, &aSig );
1076         zExp = bExp;
1077     }
1078     else {
1079         if ( aExp == 0xFF ) {
1080             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1081             return a;
1082         }
1083         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1084         zSig = 0x40000000 + aSig + bSig;
1085         zExp = aExp;
1086         goto roundAndPack;
1087     }
1088     aSig |= 0x20000000;
1089     zSig = ( aSig + bSig )<<1;
1090     --zExp;
1091     if ( (sbits32) zSig < 0 ) {
1092         zSig = aSig + bSig;
1093         ++zExp;
1094     }
1095  roundAndPack:
1096     return roundAndPackFloat32( zSign, zExp, zSig );
1097
1098 }
1099
1100 /*
1101 -------------------------------------------------------------------------------
1102 Returns the result of subtracting the absolute values of the single-
1103 precision floating-point values `a' and `b'.  If `zSign' is true, the
1104 difference is negated before being returned.  `zSign' is ignored if the
1105 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1106 Standard for Binary Floating-point Arithmetic.
1107 -------------------------------------------------------------------------------
1108 */
1109 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1110 {
1111     int16 aExp, bExp, zExp;
1112     bits32 aSig, bSig, zSig;
1113     int16 expDiff;
1114
1115     aSig = extractFloat32Frac( a );
1116     aExp = extractFloat32Exp( a );
1117     bSig = extractFloat32Frac( b );
1118     bExp = extractFloat32Exp( b );
1119     expDiff = aExp - bExp;
1120     aSig <<= 7;
1121     bSig <<= 7;
1122     if ( 0 < expDiff ) goto aExpBigger;
1123     if ( expDiff < 0 ) goto bExpBigger;
1124     if ( aExp == 0xFF ) {
1125         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1126         float_raise( float_flag_invalid );
1127         return float32_default_nan;
1128     }
1129     if ( aExp == 0 ) {
1130         aExp = 1;
1131         bExp = 1;
1132     }
1133     if ( bSig < aSig ) goto aBigger;
1134     if ( aSig < bSig ) goto bBigger;
1135     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1136  bExpBigger:
1137     if ( bExp == 0xFF ) {
1138         if ( bSig ) return propagateFloat32NaN( a, b );
1139         return packFloat32( zSign ^ 1, 0xFF, 0 );
1140     }
1141     if ( aExp == 0 ) {
1142         ++expDiff;
1143     }
1144     else {
1145         aSig |= 0x40000000;
1146     }
1147     shift32RightJamming( aSig, - expDiff, &aSig );
1148     bSig |= 0x40000000;
1149  bBigger:
1150     zSig = bSig - aSig;
1151     zExp = bExp;
1152     zSign ^= 1;
1153     goto normalizeRoundAndPack;
1154  aExpBigger:
1155     if ( aExp == 0xFF ) {
1156         if ( aSig ) return propagateFloat32NaN( a, b );
1157         return a;
1158     }
1159     if ( bExp == 0 ) {
1160         --expDiff;
1161     }
1162     else {
1163         bSig |= 0x40000000;
1164     }
1165     shift32RightJamming( bSig, expDiff, &bSig );
1166     aSig |= 0x40000000;
1167  aBigger:
1168     zSig = aSig - bSig;
1169     zExp = aExp;
1170  normalizeRoundAndPack:
1171     --zExp;
1172     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1173
1174 }
1175
1176 /*
1177 -------------------------------------------------------------------------------
1178 Returns the result of adding the single-precision floating-point values `a'
1179 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1180 Binary Floating-point Arithmetic.
1181 -------------------------------------------------------------------------------
1182 */
1183 float32 float32_add( float32 a, float32 b )
1184 {
1185     flag aSign, bSign;
1186
1187     aSign = extractFloat32Sign( a );
1188     bSign = extractFloat32Sign( b );
1189     if ( aSign == bSign ) {
1190         return addFloat32Sigs( a, b, aSign );
1191     }
1192     else {
1193         return subFloat32Sigs( a, b, aSign );
1194     }
1195
1196 }
1197
1198 /*
1199 -------------------------------------------------------------------------------
1200 Returns the result of subtracting the single-precision floating-point values
1201 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1202 for Binary Floating-point Arithmetic.
1203 -------------------------------------------------------------------------------
1204 */
1205 float32 float32_sub( float32 a, float32 b )
1206 {
1207     flag aSign, bSign;
1208
1209     aSign = extractFloat32Sign( a );
1210     bSign = extractFloat32Sign( b );
1211     if ( aSign == bSign ) {
1212         return subFloat32Sigs( a, b, aSign );
1213     }
1214     else {
1215         return addFloat32Sigs( a, b, aSign );
1216     }
1217
1218 }
1219
1220 /*
1221 -------------------------------------------------------------------------------
1222 Returns the result of multiplying the single-precision floating-point values
1223 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1224 for Binary Floating-point Arithmetic.
1225 -------------------------------------------------------------------------------
1226 */
1227 float32 float32_mul( float32 a, float32 b )
1228 {
1229     flag aSign, bSign, zSign;
1230     int16 aExp, bExp, zExp;
1231     bits32 aSig, bSig;
1232     bits64 zSig64;
1233     bits32 zSig;
1234
1235     aSig = extractFloat32Frac( a );
1236     aExp = extractFloat32Exp( a );
1237     aSign = extractFloat32Sign( a );
1238     bSig = extractFloat32Frac( b );
1239     bExp = extractFloat32Exp( b );
1240     bSign = extractFloat32Sign( b );
1241     zSign = aSign ^ bSign;
1242     if ( aExp == 0xFF ) {
1243         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1244             return propagateFloat32NaN( a, b );
1245         }
1246         if ( ( bExp | bSig ) == 0 ) {
1247             float_raise( float_flag_invalid );
1248             return float32_default_nan;
1249         }
1250         return packFloat32( zSign, 0xFF, 0 );
1251     }
1252     if ( bExp == 0xFF ) {
1253         if ( bSig ) return propagateFloat32NaN( a, b );
1254         if ( ( aExp | aSig ) == 0 ) {
1255             float_raise( float_flag_invalid );
1256             return float32_default_nan;
1257         }
1258         return packFloat32( zSign, 0xFF, 0 );
1259     }
1260     if ( aExp == 0 ) {
1261         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1262         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1263     }
1264     if ( bExp == 0 ) {
1265         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1266         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1267     }
1268     zExp = aExp + bExp - 0x7F;
1269     aSig = ( aSig | 0x00800000 )<<7;
1270     bSig = ( bSig | 0x00800000 )<<8;
1271     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1272     zSig = zSig64;
1273     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1274         zSig <<= 1;
1275         --zExp;
1276     }
1277     return roundAndPackFloat32( zSign, zExp, zSig );
1278
1279 }
1280
1281 /*
1282 -------------------------------------------------------------------------------
1283 Returns the result of dividing the single-precision floating-point value `a'
1284 by the corresponding value `b'.  The operation is performed according to the
1285 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1286 -------------------------------------------------------------------------------
1287 */
1288 float32 float32_div( float32 a, float32 b )
1289 {
1290     flag aSign, bSign, zSign;
1291     int16 aExp, bExp, zExp;
1292     bits32 aSig, bSig, zSig;
1293
1294     aSig = extractFloat32Frac( a );
1295     aExp = extractFloat32Exp( a );
1296     aSign = extractFloat32Sign( a );
1297     bSig = extractFloat32Frac( b );
1298     bExp = extractFloat32Exp( b );
1299     bSign = extractFloat32Sign( b );
1300     zSign = aSign ^ bSign;
1301     if ( aExp == 0xFF ) {
1302         if ( aSig ) return propagateFloat32NaN( a, b );
1303         if ( bExp == 0xFF ) {
1304             if ( bSig ) return propagateFloat32NaN( a, b );
1305             float_raise( float_flag_invalid );
1306             return float32_default_nan;
1307         }
1308         return packFloat32( zSign, 0xFF, 0 );
1309     }
1310     if ( bExp == 0xFF ) {
1311         if ( bSig ) return propagateFloat32NaN( a, b );
1312         return packFloat32( zSign, 0, 0 );
1313     }
1314     if ( bExp == 0 ) {
1315         if ( bSig == 0 ) {
1316             if ( ( aExp | aSig ) == 0 ) {
1317                 float_raise( float_flag_invalid );
1318                 return float32_default_nan;
1319             }
1320             float_raise( float_flag_divbyzero );
1321             return packFloat32( zSign, 0xFF, 0 );
1322         }
1323         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1324     }
1325     if ( aExp == 0 ) {
1326         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1327         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1328     }
1329     zExp = aExp - bExp + 0x7D;
1330     aSig = ( aSig | 0x00800000 )<<7;
1331     bSig = ( bSig | 0x00800000 )<<8;
1332     if ( bSig <= ( aSig + aSig ) ) {
1333         aSig >>= 1;
1334         ++zExp;
1335     }
1336     {
1337         bits64 tmp = ( (bits64) aSig )<<32;
1338         do_div( tmp, bSig );
1339         zSig = tmp;
1340     }
1341     if ( ( zSig & 0x3F ) == 0 ) {
1342         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1343     }
1344     return roundAndPackFloat32( zSign, zExp, zSig );
1345
1346 }
1347
1348 /*
1349 -------------------------------------------------------------------------------
1350 Returns the remainder of the single-precision floating-point value `a'
1351 with respect to the corresponding value `b'.  The operation is performed
1352 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1353 -------------------------------------------------------------------------------
1354 */
1355 float32 float32_rem( float32 a, float32 b )
1356 {
1357     flag aSign, bSign, zSign;
1358     int16 aExp, bExp, expDiff;
1359     bits32 aSig, bSig;
1360     bits32 q;
1361     bits64 aSig64, bSig64, q64;
1362     bits32 alternateASig;
1363     sbits32 sigMean;
1364
1365     aSig = extractFloat32Frac( a );
1366     aExp = extractFloat32Exp( a );
1367     aSign = extractFloat32Sign( a );
1368     bSig = extractFloat32Frac( b );
1369     bExp = extractFloat32Exp( b );
1370     bSign = extractFloat32Sign( b );
1371     if ( aExp == 0xFF ) {
1372         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1373             return propagateFloat32NaN( a, b );
1374         }
1375         float_raise( float_flag_invalid );
1376         return float32_default_nan;
1377     }
1378     if ( bExp == 0xFF ) {
1379         if ( bSig ) return propagateFloat32NaN( a, b );
1380         return a;
1381     }
1382     if ( bExp == 0 ) {
1383         if ( bSig == 0 ) {
1384             float_raise( float_flag_invalid );
1385             return float32_default_nan;
1386         }
1387         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1388     }
1389     if ( aExp == 0 ) {
1390         if ( aSig == 0 ) return a;
1391         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1392     }
1393     expDiff = aExp - bExp;
1394     aSig |= 0x00800000;
1395     bSig |= 0x00800000;
1396     if ( expDiff < 32 ) {
1397         aSig <<= 8;
1398         bSig <<= 8;
1399         if ( expDiff < 0 ) {
1400             if ( expDiff < -1 ) return a;
1401             aSig >>= 1;
1402         }
1403         q = ( bSig <= aSig );
1404         if ( q ) aSig -= bSig;
1405         if ( 0 < expDiff ) {
1406             bits64 tmp = ( (bits64) aSig )<<32;
1407             do_div( tmp, bSig );
1408             q = tmp;
1409             q >>= 32 - expDiff;
1410             bSig >>= 2;
1411             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1412         }
1413         else {
1414             aSig >>= 2;
1415             bSig >>= 2;
1416         }
1417     }
1418     else {
1419         if ( bSig <= aSig ) aSig -= bSig;
1420         aSig64 = ( (bits64) aSig )<<40;
1421         bSig64 = ( (bits64) bSig )<<40;
1422         expDiff -= 64;
1423         while ( 0 < expDiff ) {
1424             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1425             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1426             aSig64 = - ( ( bSig * q64 )<<38 );
1427             expDiff -= 62;
1428         }
1429         expDiff += 64;
1430         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1431         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1432         q = q64>>( 64 - expDiff );
1433         bSig <<= 6;
1434         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1435     }
1436     do {
1437         alternateASig = aSig;
1438         ++q;
1439         aSig -= bSig;
1440     } while ( 0 <= (sbits32) aSig );
1441     sigMean = aSig + alternateASig;
1442     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1443         aSig = alternateASig;
1444     }
1445     zSign = ( (sbits32) aSig < 0 );
1446     if ( zSign ) aSig = - aSig;
1447     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1448
1449 }
1450
1451 /*
1452 -------------------------------------------------------------------------------
1453 Returns the square root of the single-precision floating-point value `a'.
1454 The operation is performed according to the IEC/IEEE Standard for Binary
1455 Floating-point Arithmetic.
1456 -------------------------------------------------------------------------------
1457 */
1458 float32 float32_sqrt( float32 a )
1459 {
1460     flag aSign;
1461     int16 aExp, zExp;
1462     bits32 aSig, zSig;
1463     bits64 rem, term;
1464
1465     aSig = extractFloat32Frac( a );
1466     aExp = extractFloat32Exp( a );
1467     aSign = extractFloat32Sign( a );
1468     if ( aExp == 0xFF ) {
1469         if ( aSig ) return propagateFloat32NaN( a, 0 );
1470         if ( ! aSign ) return a;
1471         float_raise( float_flag_invalid );
1472         return float32_default_nan;
1473     }
1474     if ( aSign ) {
1475         if ( ( aExp | aSig ) == 0 ) return a;
1476         float_raise( float_flag_invalid );
1477         return float32_default_nan;
1478     }
1479     if ( aExp == 0 ) {
1480         if ( aSig == 0 ) return 0;
1481         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1482     }
1483     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1484     aSig = ( aSig | 0x00800000 )<<8;
1485     zSig = estimateSqrt32( aExp, aSig ) + 2;
1486     if ( ( zSig & 0x7F ) <= 5 ) {
1487         if ( zSig < 2 ) {
1488             zSig = 0xFFFFFFFF;
1489         }
1490         else {
1491             aSig >>= aExp & 1;
1492             term = ( (bits64) zSig ) * zSig;
1493             rem = ( ( (bits64) aSig )<<32 ) - term;
1494             while ( (sbits64) rem < 0 ) {
1495                 --zSig;
1496                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1497             }
1498             zSig |= ( rem != 0 );
1499         }
1500     }
1501     shift32RightJamming( zSig, 1, &zSig );
1502     return roundAndPackFloat32( 0, zExp, zSig );
1503
1504 }
1505
1506 /*
1507 -------------------------------------------------------------------------------
1508 Returns 1 if the single-precision floating-point value `a' is equal to the
1509 corresponding value `b', and 0 otherwise.  The comparison is performed
1510 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1511 -------------------------------------------------------------------------------
1512 */
1513 flag float32_eq( float32 a, float32 b )
1514 {
1515
1516     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1518        ) {
1519         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1520             float_raise( float_flag_invalid );
1521         }
1522         return 0;
1523     }
1524     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1525
1526 }
1527
1528 /*
1529 -------------------------------------------------------------------------------
1530 Returns 1 if the single-precision floating-point value `a' is less than or
1531 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1532 performed according to the IEC/IEEE Standard for Binary Floating-point
1533 Arithmetic.
1534 -------------------------------------------------------------------------------
1535 */
1536 flag float32_le( float32 a, float32 b )
1537 {
1538     flag aSign, bSign;
1539
1540     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1542        ) {
1543         float_raise( float_flag_invalid );
1544         return 0;
1545     }
1546     aSign = extractFloat32Sign( a );
1547     bSign = extractFloat32Sign( b );
1548     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1549     return ( a == b ) || ( aSign ^ ( a < b ) );
1550
1551 }
1552
1553 /*
1554 -------------------------------------------------------------------------------
1555 Returns 1 if the single-precision floating-point value `a' is less than
1556 the corresponding value `b', and 0 otherwise.  The comparison is performed
1557 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1558 -------------------------------------------------------------------------------
1559 */
1560 flag float32_lt( float32 a, float32 b )
1561 {
1562     flag aSign, bSign;
1563
1564     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1566        ) {
1567         float_raise( float_flag_invalid );
1568         return 0;
1569     }
1570     aSign = extractFloat32Sign( a );
1571     bSign = extractFloat32Sign( b );
1572     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1573     return ( a != b ) && ( aSign ^ ( a < b ) );
1574
1575 }
1576
1577 /*
1578 -------------------------------------------------------------------------------
1579 Returns 1 if the single-precision floating-point value `a' is equal to the
1580 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1581 if either operand is a NaN.  Otherwise, the comparison is performed
1582 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1583 -------------------------------------------------------------------------------
1584 */
1585 flag float32_eq_signaling( float32 a, float32 b )
1586 {
1587
1588     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1589          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1590        ) {
1591         float_raise( float_flag_invalid );
1592         return 0;
1593     }
1594     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1595
1596 }
1597
1598 /*
1599 -------------------------------------------------------------------------------
1600 Returns 1 if the single-precision floating-point value `a' is less than or
1601 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1602 cause an exception.  Otherwise, the comparison is performed according to the
1603 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1604 -------------------------------------------------------------------------------
1605 */
1606 flag float32_le_quiet( float32 a, float32 b )
1607 {
1608     flag aSign, bSign;
1609     //int16 aExp, bExp;
1610
1611     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1612          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1613        ) {
1614         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1615             float_raise( float_flag_invalid );
1616         }
1617         return 0;
1618     }
1619     aSign = extractFloat32Sign( a );
1620     bSign = extractFloat32Sign( b );
1621     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1622     return ( a == b ) || ( aSign ^ ( a < b ) );
1623
1624 }
1625
1626 /*
1627 -------------------------------------------------------------------------------
1628 Returns 1 if the single-precision floating-point value `a' is less than
1629 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1630 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1631 Standard for Binary Floating-point Arithmetic.
1632 -------------------------------------------------------------------------------
1633 */
1634 flag float32_lt_quiet( float32 a, float32 b )
1635 {
1636     flag aSign, bSign;
1637
1638     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1639          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1640        ) {
1641         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1642             float_raise( float_flag_invalid );
1643         }
1644         return 0;
1645     }
1646     aSign = extractFloat32Sign( a );
1647     bSign = extractFloat32Sign( b );
1648     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1649     return ( a != b ) && ( aSign ^ ( a < b ) );
1650
1651 }
1652
1653 /*
1654 -------------------------------------------------------------------------------
1655 Returns the result of converting the double-precision floating-point value
1656 `a' to the 32-bit two's complement integer format.  The conversion is
1657 performed according to the IEC/IEEE Standard for Binary Floating-point
1658 Arithmetic---which means in particular that the conversion is rounded
1659 according to the current rounding mode.  If `a' is a NaN, the largest
1660 positive integer is returned.  Otherwise, if the conversion overflows, the
1661 largest integer with the same sign as `a' is returned.
1662 -------------------------------------------------------------------------------
1663 */
1664 int32 float64_to_int32( float64 a )
1665 {
1666     flag aSign;
1667     int16 aExp, shiftCount;
1668     bits64 aSig;
1669
1670     aSig = extractFloat64Frac( a );
1671     aExp = extractFloat64Exp( a );
1672     aSign = extractFloat64Sign( a );
1673     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1674     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1675     shiftCount = 0x42C - aExp;
1676     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1677     return roundAndPackInt32( aSign, aSig );
1678
1679 }
1680
1681 /*
1682 -------------------------------------------------------------------------------
1683 Returns the result of converting the double-precision floating-point value
1684 `a' to the 32-bit two's complement integer format.  The conversion is
1685 performed according to the IEC/IEEE Standard for Binary Floating-point
1686 Arithmetic, except that the conversion is always rounded toward zero.  If
1687 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1688 conversion overflows, the largest integer with the same sign as `a' is
1689 returned.
1690 -------------------------------------------------------------------------------
1691 */
1692 int32 float64_to_int32_round_to_zero( float64 a )
1693 {
1694     flag aSign;
1695     int16 aExp, shiftCount;
1696     bits64 aSig, savedASig;
1697     int32 z;
1698
1699     aSig = extractFloat64Frac( a );
1700     aExp = extractFloat64Exp( a );
1701     aSign = extractFloat64Sign( a );
1702     shiftCount = 0x433 - aExp;
1703     if ( shiftCount < 21 ) {
1704         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1705         goto invalid;
1706     }
1707     else if ( 52 < shiftCount ) {
1708         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1709         return 0;
1710     }
1711     aSig |= LIT64( 0x0010000000000000 );
1712     savedASig = aSig;
1713     aSig >>= shiftCount;
1714     z = aSig;
1715     if ( aSign ) z = - z;
1716     if ( ( z < 0 ) ^ aSign ) {
1717  invalid:
1718         float_exception_flags |= float_flag_invalid;
1719         return aSign ? 0x80000000 : 0x7FFFFFFF;
1720     }
1721     if ( ( aSig<<shiftCount ) != savedASig ) {
1722         float_exception_flags |= float_flag_inexact;
1723     }
1724     return z;
1725
1726 }
1727
1728 /*
1729 -------------------------------------------------------------------------------
1730 Returns the result of converting the double-precision floating-point value
1731 `a' to the 32-bit two's complement unsigned integer format.  The conversion
1732 is performed according to the IEC/IEEE Standard for Binary Floating-point
1733 Arithmetic---which means in particular that the conversion is rounded
1734 according to the current rounding mode.  If `a' is a NaN, the largest
1735 positive integer is returned.  Otherwise, if the conversion overflows, the
1736 largest positive integer is returned.
1737 -------------------------------------------------------------------------------
1738 */
1739 int32 float64_to_uint32( float64 a )
1740 {
1741     flag aSign;
1742     int16 aExp, shiftCount;
1743     bits64 aSig;
1744
1745     aSig = extractFloat64Frac( a );
1746     aExp = extractFloat64Exp( a );
1747     aSign = 0; //extractFloat64Sign( a );
1748     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1749     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1750     shiftCount = 0x42C - aExp;
1751     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1752     return roundAndPackInt32( aSign, aSig );
1753 }
1754
1755 /*
1756 -------------------------------------------------------------------------------
1757 Returns the result of converting the double-precision floating-point value
1758 `a' to the 32-bit two's complement integer format.  The conversion is
1759 performed according to the IEC/IEEE Standard for Binary Floating-point
1760 Arithmetic, except that the conversion is always rounded toward zero.  If
1761 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1762 conversion overflows, the largest positive integer is returned.
1763 -------------------------------------------------------------------------------
1764 */
1765 int32 float64_to_uint32_round_to_zero( float64 a )
1766 {
1767     flag aSign;
1768     int16 aExp, shiftCount;
1769     bits64 aSig, savedASig;
1770     int32 z;
1771
1772     aSig = extractFloat64Frac( a );
1773     aExp = extractFloat64Exp( a );
1774     aSign = extractFloat64Sign( a );
1775     shiftCount = 0x433 - aExp;
1776     if ( shiftCount < 21 ) {
1777         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1778         goto invalid;
1779     }
1780     else if ( 52 < shiftCount ) {
1781         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1782         return 0;
1783     }
1784     aSig |= LIT64( 0x0010000000000000 );
1785     savedASig = aSig;
1786     aSig >>= shiftCount;
1787     z = aSig;
1788     if ( aSign ) z = - z;
1789     if ( ( z < 0 ) ^ aSign ) {
1790  invalid:
1791         float_exception_flags |= float_flag_invalid;
1792         return aSign ? 0x80000000 : 0x7FFFFFFF;
1793     }
1794     if ( ( aSig<<shiftCount ) != savedASig ) {
1795         float_exception_flags |= float_flag_inexact;
1796     }
1797     return z;
1798 }
1799
1800 /*
1801 -------------------------------------------------------------------------------
1802 Returns the result of converting the double-precision floating-point value
1803 `a' to the single-precision floating-point format.  The conversion is
1804 performed according to the IEC/IEEE Standard for Binary Floating-point
1805 Arithmetic.
1806 -------------------------------------------------------------------------------
1807 */
1808 float32 float64_to_float32( float64 a )
1809 {
1810     flag aSign;
1811     int16 aExp;
1812     bits64 aSig;
1813     bits32 zSig;
1814
1815     aSig = extractFloat64Frac( a );
1816     aExp = extractFloat64Exp( a );
1817     aSign = extractFloat64Sign( a );
1818     if ( aExp == 0x7FF ) {
1819         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1820         return packFloat32( aSign, 0xFF, 0 );
1821     }
1822     shift64RightJamming( aSig, 22, &aSig );
1823     zSig = aSig;
1824     if ( aExp || zSig ) {
1825         zSig |= 0x40000000;
1826         aExp -= 0x381;
1827     }
1828     return roundAndPackFloat32( aSign, aExp, zSig );
1829
1830 }
1831
1832 #ifdef FLOATX80
1833
1834 /*
1835 -------------------------------------------------------------------------------
1836 Returns the result of converting the double-precision floating-point value
1837 `a' to the extended double-precision floating-point format.  The conversion
1838 is performed according to the IEC/IEEE Standard for Binary Floating-point
1839 Arithmetic.
1840 -------------------------------------------------------------------------------
1841 */
1842 floatx80 float64_to_floatx80( float64 a )
1843 {
1844     flag aSign;
1845     int16 aExp;
1846     bits64 aSig;
1847
1848     aSig = extractFloat64Frac( a );
1849     aExp = extractFloat64Exp( a );
1850     aSign = extractFloat64Sign( a );
1851     if ( aExp == 0x7FF ) {
1852         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1853         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1854     }
1855     if ( aExp == 0 ) {
1856         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1857         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1858     }
1859     return
1860         packFloatx80(
1861             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1862
1863 }
1864
1865 #endif
1866
1867 /*
1868 -------------------------------------------------------------------------------
1869 Rounds the double-precision floating-point value `a' to an integer, and
1870 returns the result as a double-precision floating-point value.  The
1871 operation is performed according to the IEC/IEEE Standard for Binary
1872 Floating-point Arithmetic.
1873 -------------------------------------------------------------------------------
1874 */
1875 float64 float64_round_to_int( float64 a )
1876 {
1877     flag aSign;
1878     int16 aExp;
1879     bits64 lastBitMask, roundBitsMask;
1880     int8 roundingMode;
1881     float64 z;
1882
1883     aExp = extractFloat64Exp( a );
1884     if ( 0x433 <= aExp ) {
1885         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1886             return propagateFloat64NaN( a, a );
1887         }
1888         return a;
1889     }
1890     if ( aExp <= 0x3FE ) {
1891         if ( (bits64) ( a<<1 ) == 0 ) return a;
1892         float_exception_flags |= float_flag_inexact;
1893         aSign = extractFloat64Sign( a );
1894         switch ( float_rounding_mode ) {
1895          case float_round_nearest_even:
1896             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1897                 return packFloat64( aSign, 0x3FF, 0 );
1898             }
1899             break;
1900          case float_round_down:
1901             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1902          case float_round_up:
1903             return
1904             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1905         }
1906         return packFloat64( aSign, 0, 0 );
1907     }
1908     lastBitMask = 1;
1909     lastBitMask <<= 0x433 - aExp;
1910     roundBitsMask = lastBitMask - 1;
1911     z = a;
1912     roundingMode = float_rounding_mode;
1913     if ( roundingMode == float_round_nearest_even ) {
1914         z += lastBitMask>>1;
1915         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1916     }
1917     else if ( roundingMode != float_round_to_zero ) {
1918         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1919             z += roundBitsMask;
1920         }
1921     }
1922     z &= ~ roundBitsMask;
1923     if ( z != a ) float_exception_flags |= float_flag_inexact;
1924     return z;
1925
1926 }
1927
1928 /*
1929 -------------------------------------------------------------------------------
1930 Returns the result of adding the absolute values of the double-precision
1931 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1932 before being returned.  `zSign' is ignored if the result is a NaN.  The
1933 addition is performed according to the IEC/IEEE Standard for Binary
1934 Floating-point Arithmetic.
1935 -------------------------------------------------------------------------------
1936 */
1937 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1938 {
1939     int16 aExp, bExp, zExp;
1940     bits64 aSig, bSig, zSig;
1941     int16 expDiff;
1942
1943     aSig = extractFloat64Frac( a );
1944     aExp = extractFloat64Exp( a );
1945     bSig = extractFloat64Frac( b );
1946     bExp = extractFloat64Exp( b );
1947     expDiff = aExp - bExp;
1948     aSig <<= 9;
1949     bSig <<= 9;
1950     if ( 0 < expDiff ) {
1951         if ( aExp == 0x7FF ) {
1952             if ( aSig ) return propagateFloat64NaN( a, b );
1953             return a;
1954         }
1955         if ( bExp == 0 ) {
1956             --expDiff;
1957         }
1958         else {
1959             bSig |= LIT64( 0x2000000000000000 );
1960         }
1961         shift64RightJamming( bSig, expDiff, &bSig );
1962         zExp = aExp;
1963     }
1964     else if ( expDiff < 0 ) {
1965         if ( bExp == 0x7FF ) {
1966             if ( bSig ) return propagateFloat64NaN( a, b );
1967             return packFloat64( zSign, 0x7FF, 0 );
1968         }
1969         if ( aExp == 0 ) {
1970             ++expDiff;
1971         }
1972         else {
1973             aSig |= LIT64( 0x2000000000000000 );
1974         }
1975         shift64RightJamming( aSig, - expDiff, &aSig );
1976         zExp = bExp;
1977     }
1978     else {
1979         if ( aExp == 0x7FF ) {
1980             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1981             return a;
1982         }
1983         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1984         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1985         zExp = aExp;
1986         goto roundAndPack;
1987     }
1988     aSig |= LIT64( 0x2000000000000000 );
1989     zSig = ( aSig + bSig )<<1;
1990     --zExp;
1991     if ( (sbits64) zSig < 0 ) {
1992         zSig = aSig + bSig;
1993         ++zExp;
1994     }
1995  roundAndPack:
1996     return roundAndPackFloat64( zSign, zExp, zSig );
1997
1998 }
1999
2000 /*
2001 -------------------------------------------------------------------------------
2002 Returns the result of subtracting the absolute values of the double-
2003 precision floating-point values `a' and `b'.  If `zSign' is true, the
2004 difference is negated before being returned.  `zSign' is ignored if the
2005 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2006 Standard for Binary Floating-point Arithmetic.
2007 -------------------------------------------------------------------------------
2008 */
2009 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2010 {
2011     int16 aExp, bExp, zExp;
2012     bits64 aSig, bSig, zSig;
2013     int16 expDiff;
2014
2015     aSig = extractFloat64Frac( a );
2016     aExp = extractFloat64Exp( a );
2017     bSig = extractFloat64Frac( b );
2018     bExp = extractFloat64Exp( b );
2019     expDiff = aExp - bExp;
2020     aSig <<= 10;
2021     bSig <<= 10;
2022     if ( 0 < expDiff ) goto aExpBigger;
2023     if ( expDiff < 0 ) goto bExpBigger;
2024     if ( aExp == 0x7FF ) {
2025         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2026         float_raise( float_flag_invalid );
2027         return float64_default_nan;
2028     }
2029     if ( aExp == 0 ) {
2030         aExp = 1;
2031         bExp = 1;
2032     }
2033     if ( bSig < aSig ) goto aBigger;
2034     if ( aSig < bSig ) goto bBigger;
2035     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2036  bExpBigger:
2037     if ( bExp == 0x7FF ) {
2038         if ( bSig ) return propagateFloat64NaN( a, b );
2039         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2040     }
2041     if ( aExp == 0 ) {
2042         ++expDiff;
2043     }
2044     else {
2045         aSig |= LIT64( 0x4000000000000000 );
2046     }
2047     shift64RightJamming( aSig, - expDiff, &aSig );
2048     bSig |= LIT64( 0x4000000000000000 );
2049  bBigger:
2050     zSig = bSig - aSig;
2051     zExp = bExp;
2052     zSign ^= 1;
2053     goto normalizeRoundAndPack;
2054  aExpBigger:
2055     if ( aExp == 0x7FF ) {
2056         if ( aSig ) return propagateFloat64NaN( a, b );
2057         return a;
2058     }
2059     if ( bExp == 0 ) {
2060         --expDiff;
2061     }
2062     else {
2063         bSig |= LIT64( 0x4000000000000000 );
2064     }
2065     shift64RightJamming( bSig, expDiff, &bSig );
2066     aSig |= LIT64( 0x4000000000000000 );
2067  aBigger:
2068     zSig = aSig - bSig;
2069     zExp = aExp;
2070  normalizeRoundAndPack:
2071     --zExp;
2072     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2073
2074 }
2075
2076 /*
2077 -------------------------------------------------------------------------------
2078 Returns the result of adding the double-precision floating-point values `a'
2079 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2080 Binary Floating-point Arithmetic.
2081 -------------------------------------------------------------------------------
2082 */
2083 float64 float64_add( float64 a, float64 b )
2084 {
2085     flag aSign, bSign;
2086
2087     aSign = extractFloat64Sign( a );
2088     bSign = extractFloat64Sign( b );
2089     if ( aSign == bSign ) {
2090         return addFloat64Sigs( a, b, aSign );
2091     }
2092     else {
2093         return subFloat64Sigs( a, b, aSign );
2094     }
2095
2096 }
2097
2098 /*
2099 -------------------------------------------------------------------------------
2100 Returns the result of subtracting the double-precision floating-point values
2101 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2102 for Binary Floating-point Arithmetic.
2103 -------------------------------------------------------------------------------
2104 */
2105 float64 float64_sub( float64 a, float64 b )
2106 {
2107     flag aSign, bSign;
2108
2109     aSign = extractFloat64Sign( a );
2110     bSign = extractFloat64Sign( b );
2111     if ( aSign == bSign ) {
2112         return subFloat64Sigs( a, b, aSign );
2113     }
2114     else {
2115         return addFloat64Sigs( a, b, aSign );
2116     }
2117
2118 }
2119
2120 /*
2121 -------------------------------------------------------------------------------
2122 Returns the result of multiplying the double-precision floating-point values
2123 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2124 for Binary Floating-point Arithmetic.
2125 -------------------------------------------------------------------------------
2126 */
2127 float64 float64_mul( float64 a, float64 b )
2128 {
2129     flag aSign, bSign, zSign;
2130     int16 aExp, bExp, zExp;
2131     bits64 aSig, bSig, zSig0, zSig1;
2132
2133     aSig = extractFloat64Frac( a );
2134     aExp = extractFloat64Exp( a );
2135     aSign = extractFloat64Sign( a );
2136     bSig = extractFloat64Frac( b );
2137     bExp = extractFloat64Exp( b );
2138     bSign = extractFloat64Sign( b );
2139     zSign = aSign ^ bSign;
2140     if ( aExp == 0x7FF ) {
2141         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2142             return propagateFloat64NaN( a, b );
2143         }
2144         if ( ( bExp | bSig ) == 0 ) {
2145             float_raise( float_flag_invalid );
2146             return float64_default_nan;
2147         }
2148         return packFloat64( zSign, 0x7FF, 0 );
2149     }
2150     if ( bExp == 0x7FF ) {
2151         if ( bSig ) return propagateFloat64NaN( a, b );
2152         if ( ( aExp | aSig ) == 0 ) {
2153             float_raise( float_flag_invalid );
2154             return float64_default_nan;
2155         }
2156         return packFloat64( zSign, 0x7FF, 0 );
2157     }
2158     if ( aExp == 0 ) {
2159         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2160         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2161     }
2162     if ( bExp == 0 ) {
2163         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2164         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2165     }
2166     zExp = aExp + bExp - 0x3FF;
2167     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2168     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2169     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2170     zSig0 |= ( zSig1 != 0 );
2171     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2172         zSig0 <<= 1;
2173         --zExp;
2174     }
2175     return roundAndPackFloat64( zSign, zExp, zSig0 );
2176
2177 }
2178
2179 /*
2180 -------------------------------------------------------------------------------
2181 Returns the result of dividing the double-precision floating-point value `a'
2182 by the corresponding value `b'.  The operation is performed according to
2183 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2184 -------------------------------------------------------------------------------
2185 */
2186 float64 float64_div( float64 a, float64 b )
2187 {
2188     flag aSign, bSign, zSign;
2189     int16 aExp, bExp, zExp;
2190     bits64 aSig, bSig, zSig;
2191     bits64 rem0, rem1;
2192     bits64 term0, term1;
2193
2194     aSig = extractFloat64Frac( a );
2195     aExp = extractFloat64Exp( a );
2196     aSign = extractFloat64Sign( a );
2197     bSig = extractFloat64Frac( b );
2198     bExp = extractFloat64Exp( b );
2199     bSign = extractFloat64Sign( b );
2200     zSign = aSign ^ bSign;
2201     if ( aExp == 0x7FF ) {
2202         if ( aSig ) return propagateFloat64NaN( a, b );
2203         if ( bExp == 0x7FF ) {
2204             if ( bSig ) return propagateFloat64NaN( a, b );
2205             float_raise( float_flag_invalid );
2206             return float64_default_nan;
2207         }
2208         return packFloat64( zSign, 0x7FF, 0 );
2209     }
2210     if ( bExp == 0x7FF ) {
2211         if ( bSig ) return propagateFloat64NaN( a, b );
2212         return packFloat64( zSign, 0, 0 );
2213     }
2214     if ( bExp == 0 ) {
2215         if ( bSig == 0 ) {
2216             if ( ( aExp | aSig ) == 0 ) {
2217                 float_raise( float_flag_invalid );
2218                 return float64_default_nan;
2219             }
2220             float_raise( float_flag_divbyzero );
2221             return packFloat64( zSign, 0x7FF, 0 );
2222         }
2223         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2224     }
2225     if ( aExp == 0 ) {
2226         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2227         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2228     }
2229     zExp = aExp - bExp + 0x3FD;
2230     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2231     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2232     if ( bSig <= ( aSig + aSig ) ) {
2233         aSig >>= 1;
2234         ++zExp;
2235     }
2236     zSig = estimateDiv128To64( aSig, 0, bSig );
2237     if ( ( zSig & 0x1FF ) <= 2 ) {
2238         mul64To128( bSig, zSig, &term0, &term1 );
2239         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2240         while ( (sbits64) rem0 < 0 ) {
2241             --zSig;
2242             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2243         }
2244         zSig |= ( rem1 != 0 );
2245     }
2246     return roundAndPackFloat64( zSign, zExp, zSig );
2247
2248 }
2249
2250 /*
2251 -------------------------------------------------------------------------------
2252 Returns the remainder of the double-precision floating-point value `a'
2253 with respect to the corresponding value `b'.  The operation is performed
2254 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
2257 float64 float64_rem( float64 a, float64 b )
2258 {
2259     flag aSign, bSign, zSign;
2260     int16 aExp, bExp, expDiff;
2261     bits64 aSig, bSig;
2262     bits64 q, alternateASig;
2263     sbits64 sigMean;
2264
2265     aSig = extractFloat64Frac( a );
2266     aExp = extractFloat64Exp( a );
2267     aSign = extractFloat64Sign( a );
2268     bSig = extractFloat64Frac( b );
2269     bExp = extractFloat64Exp( b );
2270     bSign = extractFloat64Sign( b );
2271     if ( aExp == 0x7FF ) {
2272         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2273             return propagateFloat64NaN( a, b );
2274         }
2275         float_raise( float_flag_invalid );
2276         return float64_default_nan;
2277     }
2278     if ( bExp == 0x7FF ) {
2279         if ( bSig ) return propagateFloat64NaN( a, b );
2280         return a;
2281     }
2282     if ( bExp == 0 ) {
2283         if ( bSig == 0 ) {
2284             float_raise( float_flag_invalid );
2285             return float64_default_nan;
2286         }
2287         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2288     }
2289     if ( aExp == 0 ) {
2290         if ( aSig == 0 ) return a;
2291         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2292     }
2293     expDiff = aExp - bExp;
2294     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2295     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2296     if ( expDiff < 0 ) {
2297         if ( expDiff < -1 ) return a;
2298         aSig >>= 1;
2299     }
2300     q = ( bSig <= aSig );
2301     if ( q ) aSig -= bSig;
2302     expDiff -= 64;
2303     while ( 0 < expDiff ) {
2304         q = estimateDiv128To64( aSig, 0, bSig );
2305         q = ( 2 < q ) ? q - 2 : 0;
2306         aSig = - ( ( bSig>>2 ) * q );
2307         expDiff -= 62;
2308     }
2309     expDiff += 64;
2310     if ( 0 < expDiff ) {
2311         q = estimateDiv128To64( aSig, 0, bSig );
2312         q = ( 2 < q ) ? q - 2 : 0;
2313         q >>= 64 - expDiff;
2314         bSig >>= 2;
2315         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2316     }
2317     else {
2318         aSig >>= 2;
2319         bSig >>= 2;
2320     }
2321     do {
2322         alternateASig = aSig;
2323         ++q;
2324         aSig -= bSig;
2325     } while ( 0 <= (sbits64) aSig );
2326     sigMean = aSig + alternateASig;
2327     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2328         aSig = alternateASig;
2329     }
2330     zSign = ( (sbits64) aSig < 0 );
2331     if ( zSign ) aSig = - aSig;
2332     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2333
2334 }
2335
2336 /*
2337 -------------------------------------------------------------------------------
2338 Returns the square root of the double-precision floating-point value `a'.
2339 The operation is performed according to the IEC/IEEE Standard for Binary
2340 Floating-point Arithmetic.
2341 -------------------------------------------------------------------------------
2342 */
2343 float64 float64_sqrt( float64 a )
2344 {
2345     flag aSign;
2346     int16 aExp, zExp;
2347     bits64 aSig, zSig;
2348     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2349     //float64 z;
2350
2351     aSig = extractFloat64Frac( a );
2352     aExp = extractFloat64Exp( a );
2353     aSign = extractFloat64Sign( a );
2354     if ( aExp == 0x7FF ) {
2355         if ( aSig ) return propagateFloat64NaN( a, a );
2356         if ( ! aSign ) return a;
2357         float_raise( float_flag_invalid );
2358         return float64_default_nan;
2359     }
2360     if ( aSign ) {
2361         if ( ( aExp | aSig ) == 0 ) return a;
2362         float_raise( float_flag_invalid );
2363         return float64_default_nan;
2364     }
2365     if ( aExp == 0 ) {
2366         if ( aSig == 0 ) return 0;
2367         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2368     }
2369     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2370     aSig |= LIT64( 0x0010000000000000 );
2371     zSig = estimateSqrt32( aExp, aSig>>21 );
2372     zSig <<= 31;
2373     aSig <<= 9 - ( aExp & 1 );
2374     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2375     if ( ( zSig & 0x3FF ) <= 5 ) {
2376         if ( zSig < 2 ) {
2377             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2378         }
2379         else {
2380             aSig <<= 2;
2381             mul64To128( zSig, zSig, &term0, &term1 );
2382             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2383             while ( (sbits64) rem0 < 0 ) {
2384                 --zSig;
2385                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2386                 term1 |= 1;
2387                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2388             }
2389             zSig |= ( ( rem0 | rem1 ) != 0 );
2390         }
2391     }
2392     shift64RightJamming( zSig, 1, &zSig );
2393     return roundAndPackFloat64( 0, zExp, zSig );
2394
2395 }
2396
2397 /*
2398 -------------------------------------------------------------------------------
2399 Returns 1 if the double-precision floating-point value `a' is equal to the
2400 corresponding value `b', and 0 otherwise.  The comparison is performed
2401 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2402 -------------------------------------------------------------------------------
2403 */
2404 flag float64_eq( float64 a, float64 b )
2405 {
2406
2407     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2409        ) {
2410         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2411             float_raise( float_flag_invalid );
2412         }
2413         return 0;
2414     }
2415     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2416
2417 }
2418
2419 /*
2420 -------------------------------------------------------------------------------
2421 Returns 1 if the double-precision floating-point value `a' is less than or
2422 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2423 performed according to the IEC/IEEE Standard for Binary Floating-point
2424 Arithmetic.
2425 -------------------------------------------------------------------------------
2426 */
2427 flag float64_le( float64 a, float64 b )
2428 {
2429     flag aSign, bSign;
2430
2431     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2433        ) {
2434         float_raise( float_flag_invalid );
2435         return 0;
2436     }
2437     aSign = extractFloat64Sign( a );
2438     bSign = extractFloat64Sign( b );
2439     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2440     return ( a == b ) || ( aSign ^ ( a < b ) );
2441
2442 }
2443
2444 /*
2445 -------------------------------------------------------------------------------
2446 Returns 1 if the double-precision floating-point value `a' is less than
2447 the corresponding value `b', and 0 otherwise.  The comparison is performed
2448 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2449 -------------------------------------------------------------------------------
2450 */
2451 flag float64_lt( float64 a, float64 b )
2452 {
2453     flag aSign, bSign;
2454
2455     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2457        ) {
2458         float_raise( float_flag_invalid );
2459         return 0;
2460     }
2461     aSign = extractFloat64Sign( a );
2462     bSign = extractFloat64Sign( b );
2463     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2464     return ( a != b ) && ( aSign ^ ( a < b ) );
2465
2466 }
2467
2468 /*
2469 -------------------------------------------------------------------------------
2470 Returns 1 if the double-precision floating-point value `a' is equal to the
2471 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2472 if either operand is a NaN.  Otherwise, the comparison is performed
2473 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2474 -------------------------------------------------------------------------------
2475 */
2476 flag float64_eq_signaling( float64 a, float64 b )
2477 {
2478
2479     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2480          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2481        ) {
2482         float_raise( float_flag_invalid );
2483         return 0;
2484     }
2485     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2486
2487 }
2488
2489 /*
2490 -------------------------------------------------------------------------------
2491 Returns 1 if the double-precision floating-point value `a' is less than or
2492 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2493 cause an exception.  Otherwise, the comparison is performed according to the
2494 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2495 -------------------------------------------------------------------------------
2496 */
2497 flag float64_le_quiet( float64 a, float64 b )
2498 {
2499     flag aSign, bSign;
2500     //int16 aExp, bExp;
2501
2502     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2503          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2504        ) {
2505         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2506             float_raise( float_flag_invalid );
2507         }
2508         return 0;
2509     }
2510     aSign = extractFloat64Sign( a );
2511     bSign = extractFloat64Sign( b );
2512     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2513     return ( a == b ) || ( aSign ^ ( a < b ) );
2514
2515 }
2516
2517 /*
2518 -------------------------------------------------------------------------------
2519 Returns 1 if the double-precision floating-point value `a' is less than
2520 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2521 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2522 Standard for Binary Floating-point Arithmetic.
2523 -------------------------------------------------------------------------------
2524 */
2525 flag float64_lt_quiet( float64 a, float64 b )
2526 {
2527     flag aSign, bSign;
2528
2529     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2530          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2531        ) {
2532         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2533             float_raise( float_flag_invalid );
2534         }
2535         return 0;
2536     }
2537     aSign = extractFloat64Sign( a );
2538     bSign = extractFloat64Sign( b );
2539     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2540     return ( a != b ) && ( aSign ^ ( a < b ) );
2541
2542 }
2543
2544 #ifdef FLOATX80
2545
2546 /*
2547 -------------------------------------------------------------------------------
2548 Returns the result of converting the extended double-precision floating-
2549 point value `a' to the 32-bit two's complement integer format.  The
2550 conversion is performed according to the IEC/IEEE Standard for Binary
2551 Floating-point Arithmetic---which means in particular that the conversion
2552 is rounded according to the current rounding mode.  If `a' is a NaN, the
2553 largest positive integer is returned.  Otherwise, if the conversion
2554 overflows, the largest integer with the same sign as `a' is returned.
2555 -------------------------------------------------------------------------------
2556 */
2557 int32 floatx80_to_int32( floatx80 a )
2558 {
2559     flag aSign;
2560     int32 aExp, shiftCount;
2561     bits64 aSig;
2562
2563     aSig = extractFloatx80Frac( a );
2564     aExp = extractFloatx80Exp( a );
2565     aSign = extractFloatx80Sign( a );
2566     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2567     shiftCount = 0x4037 - aExp;
2568     if ( shiftCount <= 0 ) shiftCount = 1;
2569     shift64RightJamming( aSig, shiftCount, &aSig );
2570     return roundAndPackInt32( aSign, aSig );
2571
2572 }
2573
2574 /*
2575 -------------------------------------------------------------------------------
2576 Returns the result of converting the extended double-precision floating-
2577 point value `a' to the 32-bit two's complement integer format.  The
2578 conversion is performed according to the IEC/IEEE Standard for Binary
2579 Floating-point Arithmetic, except that the conversion is always rounded
2580 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2581 Otherwise, if the conversion overflows, the largest integer with the same
2582 sign as `a' is returned.
2583 -------------------------------------------------------------------------------
2584 */
2585 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2586 {
2587     flag aSign;
2588     int32 aExp, shiftCount;
2589     bits64 aSig, savedASig;
2590     int32 z;
2591
2592     aSig = extractFloatx80Frac( a );
2593     aExp = extractFloatx80Exp( a );
2594     aSign = extractFloatx80Sign( a );
2595     shiftCount = 0x403E - aExp;
2596     if ( shiftCount < 32 ) {
2597         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2598         goto invalid;
2599     }
2600     else if ( 63 < shiftCount ) {
2601         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2602         return 0;
2603     }
2604     savedASig = aSig;
2605     aSig >>= shiftCount;
2606     z = aSig;
2607     if ( aSign ) z = - z;
2608     if ( ( z < 0 ) ^ aSign ) {
2609  invalid:
2610         float_exception_flags |= float_flag_invalid;
2611         return aSign ? 0x80000000 : 0x7FFFFFFF;
2612     }
2613     if ( ( aSig<<shiftCount ) != savedASig ) {
2614         float_exception_flags |= float_flag_inexact;
2615     }
2616     return z;
2617
2618 }
2619
2620 /*
2621 -------------------------------------------------------------------------------
2622 Returns the result of converting the extended double-precision floating-
2623 point value `a' to the single-precision floating-point format.  The
2624 conversion is performed according to the IEC/IEEE Standard for Binary
2625 Floating-point Arithmetic.
2626 -------------------------------------------------------------------------------
2627 */
2628 float32 floatx80_to_float32( floatx80 a )
2629 {
2630     flag aSign;
2631     int32 aExp;
2632     bits64 aSig;
2633
2634     aSig = extractFloatx80Frac( a );
2635     aExp = extractFloatx80Exp( a );
2636     aSign = extractFloatx80Sign( a );
2637     if ( aExp == 0x7FFF ) {
2638         if ( (bits64) ( aSig<<1 ) ) {
2639             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2640         }
2641         return packFloat32( aSign, 0xFF, 0 );
2642     }
2643     shift64RightJamming( aSig, 33, &aSig );
2644     if ( aExp || aSig ) aExp -= 0x3F81;
2645     return roundAndPackFloat32( aSign, aExp, aSig );
2646
2647 }
2648
2649 /*
2650 -------------------------------------------------------------------------------
2651 Returns the result of converting the extended double-precision floating-
2652 point value `a' to the double-precision floating-point format.  The
2653 conversion is performed according to the IEC/IEEE Standard for Binary
2654 Floating-point Arithmetic.
2655 -------------------------------------------------------------------------------
2656 */
2657 float64 floatx80_to_float64( floatx80 a )
2658 {
2659     flag aSign;
2660     int32 aExp;
2661     bits64 aSig, zSig;
2662
2663     aSig = extractFloatx80Frac( a );
2664     aExp = extractFloatx80Exp( a );
2665     aSign = extractFloatx80Sign( a );
2666     if ( aExp == 0x7FFF ) {
2667         if ( (bits64) ( aSig<<1 ) ) {
2668             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2669         }
2670         return packFloat64( aSign, 0x7FF, 0 );
2671     }
2672     shift64RightJamming( aSig, 1, &zSig );
2673     if ( aExp || aSig ) aExp -= 0x3C01;
2674     return roundAndPackFloat64( aSign, aExp, zSig );
2675
2676 }
2677
2678 /*
2679 -------------------------------------------------------------------------------
2680 Rounds the extended double-precision floating-point value `a' to an integer,
2681 and returns the result as an extended quadruple-precision floating-point
2682 value.  The operation is performed according to the IEC/IEEE Standard for
2683 Binary Floating-point Arithmetic.
2684 -------------------------------------------------------------------------------
2685 */
2686 floatx80 floatx80_round_to_int( floatx80 a )
2687 {
2688     flag aSign;
2689     int32 aExp;
2690     bits64 lastBitMask, roundBitsMask;
2691     int8 roundingMode;
2692     floatx80 z;
2693
2694     aExp = extractFloatx80Exp( a );
2695     if ( 0x403E <= aExp ) {
2696         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2697             return propagateFloatx80NaN( a, a );
2698         }
2699         return a;
2700     }
2701     if ( aExp <= 0x3FFE ) {
2702         if (    ( aExp == 0 )
2703              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2704             return a;
2705         }
2706         float_exception_flags |= float_flag_inexact;
2707         aSign = extractFloatx80Sign( a );
2708         switch ( float_rounding_mode ) {
2709          case float_round_nearest_even:
2710             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2711                ) {
2712                 return
2713                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2714             }
2715             break;
2716          case float_round_down:
2717             return
2718                   aSign ?
2719                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2720                 : packFloatx80( 0, 0, 0 );
2721          case float_round_up:
2722             return
2723                   aSign ? packFloatx80( 1, 0, 0 )
2724                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2725         }
2726         return packFloatx80( aSign, 0, 0 );
2727     }
2728     lastBitMask = 1;
2729     lastBitMask <<= 0x403E - aExp;
2730     roundBitsMask = lastBitMask - 1;
2731     z = a;
2732     roundingMode = float_rounding_mode;
2733     if ( roundingMode == float_round_nearest_even ) {
2734         z.low += lastBitMask>>1;
2735         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2736     }
2737     else if ( roundingMode != float_round_to_zero ) {
2738         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2739             z.low += roundBitsMask;
2740         }
2741     }
2742     z.low &= ~ roundBitsMask;
2743     if ( z.low == 0 ) {
2744         ++z.high;
2745         z.low = LIT64( 0x8000000000000000 );
2746     }
2747     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2748     return z;
2749
2750 }
2751
2752 /*
2753 -------------------------------------------------------------------------------
2754 Returns the result of adding the absolute values of the extended double-
2755 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2756 negated before being returned.  `zSign' is ignored if the result is a NaN.
2757 The addition is performed according to the IEC/IEEE Standard for Binary
2758 Floating-point Arithmetic.
2759 -------------------------------------------------------------------------------
2760 */
2761 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2762 {
2763     int32 aExp, bExp, zExp;
2764     bits64 aSig, bSig, zSig0, zSig1;
2765     int32 expDiff;
2766
2767     aSig = extractFloatx80Frac( a );
2768     aExp = extractFloatx80Exp( a );
2769     bSig = extractFloatx80Frac( b );
2770     bExp = extractFloatx80Exp( b );
2771     expDiff = aExp - bExp;
2772     if ( 0 < expDiff ) {
2773         if ( aExp == 0x7FFF ) {
2774             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2775             return a;
2776         }
2777         if ( bExp == 0 ) --expDiff;
2778         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2779         zExp = aExp;
2780     }
2781     else if ( expDiff < 0 ) {
2782         if ( bExp == 0x7FFF ) {
2783             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2784             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2785         }
2786         if ( aExp == 0 ) ++expDiff;
2787         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2788         zExp = bExp;
2789     }
2790     else {
2791         if ( aExp == 0x7FFF ) {
2792             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2793                 return propagateFloatx80NaN( a, b );
2794             }
2795             return a;
2796         }
2797         zSig1 = 0;
2798         zSig0 = aSig + bSig;
2799         if ( aExp == 0 ) {
2800             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2801             goto roundAndPack;
2802         }
2803         zExp = aExp;
2804         goto shiftRight1;
2805     }
2806
2807     zSig0 = aSig + bSig;
2808
2809     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2810  shiftRight1:
2811     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2812     zSig0 |= LIT64( 0x8000000000000000 );
2813     ++zExp;
2814  roundAndPack:
2815     return
2816         roundAndPackFloatx80(
2817             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2818
2819 }
2820
2821 /*
2822 -------------------------------------------------------------------------------
2823 Returns the result of subtracting the absolute values of the extended
2824 double-precision floating-point values `a' and `b'.  If `zSign' is true,
2825 the difference is negated before being returned.  `zSign' is ignored if the
2826 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2827 Standard for Binary Floating-point Arithmetic.
2828 -------------------------------------------------------------------------------
2829 */
2830 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2831 {
2832     int32 aExp, bExp, zExp;
2833     bits64 aSig, bSig, zSig0, zSig1;
2834     int32 expDiff;
2835     floatx80 z;
2836
2837     aSig = extractFloatx80Frac( a );
2838     aExp = extractFloatx80Exp( a );
2839     bSig = extractFloatx80Frac( b );
2840     bExp = extractFloatx80Exp( b );
2841     expDiff = aExp - bExp;
2842     if ( 0 < expDiff ) goto aExpBigger;
2843     if ( expDiff < 0 ) goto bExpBigger;
2844     if ( aExp == 0x7FFF ) {
2845         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2846             return propagateFloatx80NaN( a, b );
2847         }
2848         float_raise( float_flag_invalid );
2849         z.low = floatx80_default_nan_low;
2850         z.high = floatx80_default_nan_high;
2851         return z;
2852     }
2853     if ( aExp == 0 ) {
2854         aExp = 1;
2855         bExp = 1;
2856     }
2857     zSig1 = 0;
2858     if ( bSig < aSig ) goto aBigger;
2859     if ( aSig < bSig ) goto bBigger;
2860     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2861  bExpBigger:
2862     if ( bExp == 0x7FFF ) {
2863         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2864         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2865     }
2866     if ( aExp == 0 ) ++expDiff;
2867     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2868  bBigger:
2869     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2870     zExp = bExp;
2871     zSign ^= 1;
2872     goto normalizeRoundAndPack;
2873  aExpBigger:
2874     if ( aExp == 0x7FFF ) {
2875         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2876         return a;
2877     }
2878     if ( bExp == 0 ) --expDiff;
2879     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2880  aBigger:
2881     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2882     zExp = aExp;
2883  normalizeRoundAndPack:
2884     return
2885         normalizeRoundAndPackFloatx80(
2886             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2887
2888 }
2889
2890 /*
2891 -------------------------------------------------------------------------------
2892 Returns the result of adding the extended double-precision floating-point
2893 values `a' and `b'.  The operation is performed according to the IEC/IEEE
2894 Standard for Binary Floating-point Arithmetic.
2895 -------------------------------------------------------------------------------
2896 */
2897 floatx80 floatx80_add( floatx80 a, floatx80 b )
2898 {
2899     flag aSign, bSign;
2900
2901     aSign = extractFloatx80Sign( a );
2902     bSign = extractFloatx80Sign( b );
2903     if ( aSign == bSign ) {
2904         return addFloatx80Sigs( a, b, aSign );
2905     }
2906     else {
2907         return subFloatx80Sigs( a, b, aSign );
2908     }
2909
2910 }
2911
2912 /*
2913 -------------------------------------------------------------------------------
2914 Returns the result of subtracting the extended double-precision floating-
2915 point values `a' and `b'.  The operation is performed according to the
2916 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2917 -------------------------------------------------------------------------------
2918 */
2919 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2920 {
2921     flag aSign, bSign;
2922
2923     aSign = extractFloatx80Sign( a );
2924     bSign = extractFloatx80Sign( b );
2925     if ( aSign == bSign ) {
2926         return subFloatx80Sigs( a, b, aSign );
2927     }
2928     else {
2929         return addFloatx80Sigs( a, b, aSign );
2930     }
2931
2932 }
2933
2934 /*
2935 -------------------------------------------------------------------------------
2936 Returns the result of multiplying the extended double-precision floating-
2937 point values `a' and `b'.  The operation is performed according to the
2938 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2939 -------------------------------------------------------------------------------
2940 */
2941 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2942 {
2943     flag aSign, bSign, zSign;
2944     int32 aExp, bExp, zExp;
2945     bits64 aSig, bSig, zSig0, zSig1;
2946     floatx80 z;
2947
2948     aSig = extractFloatx80Frac( a );
2949     aExp = extractFloatx80Exp( a );
2950     aSign = extractFloatx80Sign( a );
2951     bSig = extractFloatx80Frac( b );
2952     bExp = extractFloatx80Exp( b );
2953     bSign = extractFloatx80Sign( b );
2954     zSign = aSign ^ bSign;
2955     if ( aExp == 0x7FFF ) {
2956         if (    (bits64) ( aSig<<1 )
2957              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2958             return propagateFloatx80NaN( a, b );
2959         }
2960         if ( ( bExp | bSig ) == 0 ) goto invalid;
2961         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2962     }
2963     if ( bExp == 0x7FFF ) {
2964         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2965         if ( ( aExp | aSig ) == 0 ) {
2966  invalid:
2967             float_raise( float_flag_invalid );
2968             z.low = floatx80_default_nan_low;
2969             z.high = floatx80_default_nan_high;
2970             return z;
2971         }
2972         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2973     }
2974     if ( aExp == 0 ) {
2975         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2976         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2977     }
2978     if ( bExp == 0 ) {
2979         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2980         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2981     }
2982     zExp = aExp + bExp - 0x3FFE;
2983     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2984     if ( 0 < (sbits64) zSig0 ) {
2985         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2986         --zExp;
2987     }
2988     return
2989         roundAndPackFloatx80(
2990             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2991
2992 }
2993
2994 /*
2995 -------------------------------------------------------------------------------
2996 Returns the result of dividing the extended double-precision floating-point
2997 value `a' by the corresponding value `b'.  The operation is performed
2998 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2999 -------------------------------------------------------------------------------
3000 */
3001 floatx80 floatx80_div( floatx80 a, floatx80 b )
3002 {
3003     flag aSign, bSign, zSign;
3004     int32 aExp, bExp, zExp;
3005     bits64 aSig, bSig, zSig0, zSig1;
3006     bits64 rem0, rem1, rem2, term0, term1, term2;
3007     floatx80 z;
3008
3009     aSig = extractFloatx80Frac( a );
3010     aExp = extractFloatx80Exp( a );
3011     aSign = extractFloatx80Sign( a );
3012     bSig = extractFloatx80Frac( b );
3013     bExp = extractFloatx80Exp( b );
3014     bSign = extractFloatx80Sign( b );
3015     zSign = aSign ^ bSign;
3016     if ( aExp == 0x7FFF ) {
3017         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3018         if ( bExp == 0x7FFF ) {
3019             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3020             goto invalid;
3021         }
3022         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3023     }
3024     if ( bExp == 0x7FFF ) {
3025         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3026         return packFloatx80( zSign, 0, 0 );
3027     }
3028     if ( bExp == 0 ) {
3029         if ( bSig == 0 ) {
3030             if ( ( aExp | aSig ) == 0 ) {
3031  invalid:
3032                 float_raise( float_flag_invalid );
3033                 z.low = floatx80_default_nan_low;
3034                 z.high = floatx80_default_nan_high;
3035                 return z;
3036             }
3037             float_raise( float_flag_divbyzero );
3038             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3039         }
3040         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3041     }
3042     if ( aExp == 0 ) {
3043         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3044         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3045     }
3046     zExp = aExp - bExp + 0x3FFE;
3047     rem1 = 0;
3048     if ( bSig <= aSig ) {
3049         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3050         ++zExp;
3051     }
3052     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3053     mul64To128( bSig, zSig0, &term0, &term1 );
3054     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3055     while ( (sbits64) rem0 < 0 ) {
3056         --zSig0;
3057         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3058     }
3059     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3060     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3061         mul64To128( bSig, zSig1, &term1, &term2 );
3062         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3063         while ( (sbits64) rem1 < 0 ) {
3064             --zSig1;
3065             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3066         }
3067         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3068     }
3069     return
3070         roundAndPackFloatx80(
3071             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3072
3073 }
3074
3075 /*
3076 -------------------------------------------------------------------------------
3077 Returns the remainder of the extended double-precision floating-point value
3078 `a' with respect to the corresponding value `b'.  The operation is performed
3079 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3080 -------------------------------------------------------------------------------
3081 */
3082 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3083 {
3084     flag aSign, bSign, zSign;
3085     int32 aExp, bExp, expDiff;
3086     bits64 aSig0, aSig1, bSig;
3087     bits64 q, term0, term1, alternateASig0, alternateASig1;
3088     floatx80 z;
3089
3090     aSig0 = extractFloatx80Frac( a );
3091     aExp = extractFloatx80Exp( a );
3092     aSign = extractFloatx80Sign( a );
3093     bSig = extractFloatx80Frac( b );
3094     bExp = extractFloatx80Exp( b );
3095     bSign = extractFloatx80Sign( b );
3096     if ( aExp == 0x7FFF ) {
3097         if (    (bits64) ( aSig0<<1 )
3098              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3099             return propagateFloatx80NaN( a, b );
3100         }
3101         goto invalid;
3102     }
3103     if ( bExp == 0x7FFF ) {
3104         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3105         return a;
3106     }
3107     if ( bExp == 0 ) {
3108         if ( bSig == 0 ) {
3109  invalid:
3110             float_raise( float_flag_invalid );
3111             z.low = floatx80_default_nan_low;
3112             z.high = floatx80_default_nan_high;
3113             return z;
3114         }
3115         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3116     }
3117     if ( aExp == 0 ) {
3118         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3119         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3120     }
3121     bSig |= LIT64( 0x8000000000000000 );
3122     zSign = aSign;
3123     expDiff = aExp - bExp;
3124     aSig1 = 0;
3125     if ( expDiff < 0 ) {
3126         if ( expDiff < -1 ) return a;
3127         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3128         expDiff = 0;
3129     }
3130     q = ( bSig <= aSig0 );
3131     if ( q ) aSig0 -= bSig;
3132     expDiff -= 64;
3133     while ( 0 < expDiff ) {
3134         q = estimateDiv128To64( aSig0, aSig1, bSig );
3135         q = ( 2 < q ) ? q - 2 : 0;
3136         mul64To128( bSig, q, &term0, &term1 );
3137         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3138         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3139         expDiff -= 62;
3140     }
3141     expDiff += 64;
3142     if ( 0 < expDiff ) {
3143         q = estimateDiv128To64( aSig0, aSig1, bSig );
3144         q = ( 2 < q ) ? q - 2 : 0;
3145         q >>= 64 - expDiff;
3146         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3147         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3148         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3149         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3150             ++q;
3151             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3152         }
3153     }
3154     else {
3155         term1 = 0;
3156         term0 = bSig;
3157     }
3158     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3159     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3160          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3161               && ( q & 1 ) )
3162        ) {
3163         aSig0 = alternateASig0;
3164         aSig1 = alternateASig1;
3165         zSign = ! zSign;
3166     }
3167     return
3168         normalizeRoundAndPackFloatx80(
3169             80, zSign, bExp + expDiff, aSig0, aSig1 );
3170
3171 }
3172
3173 /*
3174 -------------------------------------------------------------------------------
3175 Returns the square root of the extended double-precision floating-point
3176 value `a'.  The operation is performed according to the IEC/IEEE Standard
3177 for Binary Floating-point Arithmetic.
3178 -------------------------------------------------------------------------------
3179 */
3180 floatx80 floatx80_sqrt( floatx80 a )
3181 {
3182     flag aSign;
3183     int32 aExp, zExp;
3184     bits64 aSig0, aSig1, zSig0, zSig1;
3185     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3186     bits64 shiftedRem0, shiftedRem1;
3187     floatx80 z;
3188
3189     aSig0 = extractFloatx80Frac( a );
3190     aExp = extractFloatx80Exp( a );
3191     aSign = extractFloatx80Sign( a );
3192     if ( aExp == 0x7FFF ) {
3193         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3194         if ( ! aSign ) return a;
3195         goto invalid;
3196     }
3197     if ( aSign ) {
3198         if ( ( aExp | aSig0 ) == 0 ) return a;
3199  invalid:
3200         float_raise( float_flag_invalid );
3201         z.low = floatx80_default_nan_low;
3202         z.high = floatx80_default_nan_high;
3203         return z;
3204     }
3205     if ( aExp == 0 ) {
3206         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3207         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3208     }
3209     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3210     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3211     zSig0 <<= 31;
3212     aSig1 = 0;
3213     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3214     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3215     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3216     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3217     mul64To128( zSig0, zSig0, &term0, &term1 );
3218     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3219     while ( (sbits64) rem0 < 0 ) {
3220         --zSig0;
3221         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3222         term1 |= 1;
3223         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3224     }
3225     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3226     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3227     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3228         if ( zSig1 == 0 ) zSig1 = 1;
3229         mul64To128( zSig0, zSig1, &term1, &term2 );
3230         shortShift128Left( term1, term2, 1, &term1, &term2 );
3231         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3232         mul64To128( zSig1, zSig1, &term2, &term3 );
3233         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3234         while ( (sbits64) rem1 < 0 ) {
3235             --zSig1;
3236             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3237             term3 |= 1;
3238             add192(
3239                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3240         }
3241         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3242     }
3243     return
3244         roundAndPackFloatx80(
3245             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3246
3247 }
3248
3249 /*
3250 -------------------------------------------------------------------------------
3251 Returns 1 if the extended double-precision floating-point value `a' is
3252 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3253 performed according to the IEC/IEEE Standard for Binary Floating-point
3254 Arithmetic.
3255 -------------------------------------------------------------------------------
3256 */
3257 flag floatx80_eq( floatx80 a, floatx80 b )
3258 {
3259
3260     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3261               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3262          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3263               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3264        ) {
3265         if (    floatx80_is_signaling_nan( a )
3266              || floatx80_is_signaling_nan( b ) ) {
3267             float_raise( float_flag_invalid );
3268         }
3269         return 0;
3270     }
3271     return
3272            ( a.low == b.low )
3273         && (    ( a.high == b.high )
3274              || (    ( a.low == 0 )
3275                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3276            );
3277
3278 }
3279
3280 /*
3281 -------------------------------------------------------------------------------
3282 Returns 1 if the extended double-precision floating-point value `a' is
3283 less than or equal to the corresponding value `b', and 0 otherwise.  The
3284 comparison is performed according to the IEC/IEEE Standard for Binary
3285 Floating-point Arithmetic.
3286 -------------------------------------------------------------------------------
3287 */
3288 flag floatx80_le( floatx80 a, floatx80 b )
3289 {
3290     flag aSign, bSign;
3291
3292     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3293               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3294          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3295               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3296        ) {
3297         float_raise( float_flag_invalid );
3298         return 0;
3299     }
3300     aSign = extractFloatx80Sign( a );
3301     bSign = extractFloatx80Sign( b );
3302     if ( aSign != bSign ) {
3303         return
3304                aSign
3305             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3306                  == 0 );
3307     }
3308     return
3309           aSign ? le128( b.high, b.low, a.high, a.low )
3310         : le128( a.high, a.low, b.high, b.low );
3311
3312 }
3313
3314 /*
3315 -------------------------------------------------------------------------------
3316 Returns 1 if the extended double-precision floating-point value `a' is
3317 less than the corresponding value `b', and 0 otherwise.  The comparison
3318 is performed according to the IEC/IEEE Standard for Binary Floating-point
3319 Arithmetic.
3320 -------------------------------------------------------------------------------
3321 */
3322 flag floatx80_lt( floatx80 a, floatx80 b )
3323 {
3324     flag aSign, bSign;
3325
3326     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3327               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3328          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3329               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3330        ) {
3331         float_raise( float_flag_invalid );
3332         return 0;
3333     }
3334     aSign = extractFloatx80Sign( a );
3335     bSign = extractFloatx80Sign( b );
3336     if ( aSign != bSign ) {
3337         return
3338                aSign
3339             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3340                  != 0 );
3341     }
3342     return
3343           aSign ? lt128( b.high, b.low, a.high, a.low )
3344         : lt128( a.high, a.low, b.high, b.low );
3345
3346 }
3347
3348 /*
3349 -------------------------------------------------------------------------------
3350 Returns 1 if the extended double-precision floating-point value `a' is equal
3351 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3352 raised if either operand is a NaN.  Otherwise, the comparison is performed
3353 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3354 -------------------------------------------------------------------------------
3355 */
3356 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3357 {
3358
3359     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3360               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3361          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3362               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3363        ) {
3364         float_raise( float_flag_invalid );
3365         return 0;
3366     }
3367     return
3368            ( a.low == b.low )
3369         && (    ( a.high == b.high )
3370              || (    ( a.low == 0 )
3371                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3372            );
3373
3374 }
3375
3376 /*
3377 -------------------------------------------------------------------------------
3378 Returns 1 if the extended double-precision floating-point value `a' is less
3379 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3380 do not cause an exception.  Otherwise, the comparison is performed according
3381 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382 -------------------------------------------------------------------------------
3383 */
3384 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3385 {
3386     flag aSign, bSign;
3387
3388     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3389               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3390          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3391               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3392        ) {
3393         if (    floatx80_is_signaling_nan( a )
3394              || floatx80_is_signaling_nan( b ) ) {
3395             float_raise( float_flag_invalid );
3396         }
3397         return 0;
3398     }
3399     aSign = extractFloatx80Sign( a );
3400     bSign = extractFloatx80Sign( b );
3401     if ( aSign != bSign ) {
3402         return
3403                aSign
3404             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3405                  == 0 );
3406     }
3407     return
3408           aSign ? le128( b.high, b.low, a.high, a.low )
3409         : le128( a.high, a.low, b.high, b.low );
3410
3411 }
3412
3413 /*
3414 -------------------------------------------------------------------------------
3415 Returns 1 if the extended double-precision floating-point value `a' is less
3416 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3417 an exception.  Otherwise, the comparison is performed according to the
3418 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3419 -------------------------------------------------------------------------------
3420 */
3421 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3422 {
3423     flag aSign, bSign;
3424
3425     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3426               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3427          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3428               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3429        ) {
3430         if (    floatx80_is_signaling_nan( a )
3431              || floatx80_is_signaling_nan( b ) ) {
3432             float_raise( float_flag_invalid );
3433         }
3434         return 0;
3435     }
3436     aSign = extractFloatx80Sign( a );
3437     bSign = extractFloatx80Sign( b );
3438     if ( aSign != bSign ) {
3439         return
3440                aSign
3441             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3442                  != 0 );
3443     }
3444     return
3445           aSign ? lt128( b.high, b.low, a.high, a.low )
3446         : lt128( a.high, a.low, b.high, b.low );
3447
3448 }
3449
3450 #endif
3451