1 /* 1 2 ============================================== 3 4 This C source file is part of the SoftFloat IE 5 Arithmetic Package, Release 2. 6 7 Written by John R. Hauser. This work was made 8 International Computer Science Institute, loca 9 Street, Berkeley, California 94704. Funding w 10 National Science Foundation under grant MIP-93 11 of this code was written as part of a project 12 processor in collaboration with the University 13 overseen by Profs. Nelson Morgan and John Wawr 14 is available through the web page 15 http://www.jhauser.us/arithmetic/SoftFloat-2b/ 16 17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. 18 has been made to avoid it, THIS SOFTWARE MAY C 19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF TH 20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAK 21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISI 22 23 Derivative works are acceptable, even for comm 24 (1) they include prominent notice that the wor 25 include prominent notice akin to these three p 26 this code that are retained. 27 28 ============================================== 29 */ 30 31 #include <asm/div64.h> 32 33 #include "fpa11.h" 34 //#include "milieu.h" 35 //#include "softfloat.h" 36 37 /* 38 ---------------------------------------------- 39 Primitive arithmetic functions, including mult 40 division and square root approximations. (Can 41 desired.) 42 ---------------------------------------------- 43 */ 44 #include "softfloat-macros" 45 46 /* 47 ---------------------------------------------- 48 Functions and definitions to determine: (1) w 49 is detected before or after rounding by defaul 50 happens when exceptions are raised, (3) how si 51 from quiet NaNs, (4) the default generated qui 52 are propagated from function inputs to output. 53 specific. 54 ---------------------------------------------- 55 */ 56 #include "softfloat-specialize" 57 58 /* 59 ---------------------------------------------- 60 Takes a 64-bit fixed-point value `absZ' with b 61 and 7, and returns the properly rounded 32-bit 62 input. If `zSign' is nonzero, the input is ne 63 to an integer. Bit 63 of `absZ' must be zero. 64 input is simply rounded to an integer, with th 65 the input cannot be represented exactly as an 66 input is too large, however, the invalid excep 67 positive or negative integer is returned. 68 ---------------------------------------------- 69 */ 70 static int32 roundAndPackInt32( struct roundin 71 { 72 int8 roundingMode; 73 flag roundNearestEven; 74 int8 roundIncrement, roundBits; 75 int32 z; 76 77 roundingMode = roundData->mode; 78 roundNearestEven = ( roundingMode == float 79 roundIncrement = 0x40; 80 if ( ! roundNearestEven ) { 81 if ( roundingMode == float_round_to_ze 82 roundIncrement = 0; 83 } 84 else { 85 roundIncrement = 0x7F; 86 if ( zSign ) { 87 if ( roundingMode == float_rou 88 } 89 else { 90 if ( roundingMode == float_rou 91 } 92 } 93 } 94 roundBits = absZ & 0x7F; 95 absZ = ( absZ + roundIncrement )>>7; 96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) 97 z = absZ; 98 if ( zSign ) z = - z; 99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ 100 roundData->exception |= float_flag_inv 101 return zSign ? 0x80000000 : 0x7FFFFFFF 102 } 103 if ( roundBits ) roundData->exception |= f 104 return z; 105 106 } 107 108 /* 109 ---------------------------------------------- 110 Returns the fraction bits of the single-precis 111 ---------------------------------------------- 112 */ 113 INLINE bits32 extractFloat32Frac( float32 a ) 114 { 115 116 return a & 0x007FFFFF; 117 118 } 119 120 /* 121 ---------------------------------------------- 122 Returns the exponent bits of the single-precis 123 ---------------------------------------------- 124 */ 125 INLINE int16 extractFloat32Exp( float32 a ) 126 { 127 128 return ( a>>23 ) & 0xFF; 129 130 } 131 132 /* 133 ---------------------------------------------- 134 Returns the sign bit of the single-precision f 135 ---------------------------------------------- 136 */ 137 #if 0 /* in softfloat.h */ 138 INLINE flag extractFloat32Sign( float32 a ) 139 { 140 141 return a>>31; 142 143 } 144 #endif 145 146 /* 147 ---------------------------------------------- 148 Normalizes the subnormal single-precision floa 149 by the denormalized significand `aSig'. The n 150 significand are stored at the locations pointe 151 `zSigPtr', respectively. 152 ---------------------------------------------- 153 */ 154 static void 155 normalizeFloat32Subnormal( bits32 aSig, int16 156 { 157 int8 shiftCount; 158 159 shiftCount = countLeadingZeros32( aSig ) - 160 *zSigPtr = aSig<<shiftCount; 161 *zExpPtr = 1 - shiftCount; 162 163 } 164 165 /* 166 ---------------------------------------------- 167 Packs the sign `zSign', exponent `zExp', and s 168 single-precision floating-point value, returni 169 shifted into the proper positions, the three f 170 together to form the result. This means that 171 will be added into the exponent. Since a prop 172 will have an integer portion equal to 1, the ` 173 than the desired result exponent whenever `zSi 174 significand. 175 ---------------------------------------------- 176 */ 177 INLINE float32 packFloat32( flag zSign, int16 178 { 179 #if 0 180 float32 f; 181 __asm__("@ packFloat32 182 mov %0, %1, asl #31 183 orr %0, %2, asl #23 184 orr %0, %3" 185 : /* no outputs */ 186 : "g" (f), "g" (zSign), "g" (zExp) 187 : "cc"); 188 return f; 189 #else 190 return ( ( (bits32) zSign )<<31 ) + ( ( (b 191 #endif 192 } 193 194 /* 195 ---------------------------------------------- 196 Takes an abstract floating-point value having 197 and significand `zSig', and returns the proper 198 point value corresponding to the abstract inpu 199 value is simply rounded and packed into the si 200 the inexact exception raised if the abstract i 201 exactly. If the abstract value is too large, 202 inexact exceptions are raised and an infinity 203 returned. If the abstract value is too small, 204 a subnormal number, and the underflow and inex 205 the abstract input cannot be represented exact 206 precision floating-point number. 207 The input significand `zSig' has its binar 208 and 29, which is 7 bits to the left of the usu 209 significand must be normalized or smaller. If 210 `zExp' must be 0; in that case, the result ret 211 and it must not require rounding. In the usua 212 normalized, `zExp' must be 1 less than the ``t 213 The handling of underflow and overflow follows 214 Binary Floating-point Arithmetic. 215 ---------------------------------------------- 216 */ 217 static float32 roundAndPackFloat32( struct rou 218 { 219 int8 roundingMode; 220 flag roundNearestEven; 221 int8 roundIncrement, roundBits; 222 flag isTiny; 223 224 roundingMode = roundData->mode; 225 roundNearestEven = ( roundingMode == float 226 roundIncrement = 0x40; 227 if ( ! roundNearestEven ) { 228 if ( roundingMode == float_round_to_ze 229 roundIncrement = 0; 230 } 231 else { 232 roundIncrement = 0x7F; 233 if ( zSign ) { 234 if ( roundingMode == float_rou 235 } 236 else { 237 if ( roundingMode == float_rou 238 } 239 } 240 } 241 roundBits = zSig & 0x7F; 242 if ( 0xFD <= (bits16) zExp ) { 243 if ( ( 0xFD < zExp ) 244 || ( ( zExp == 0xFD ) 245 && ( (sbits32) ( zSig + roun 246 ) { 247 roundData->exception |= float_flag 248 return packFloat32( zSign, 0xFF, 0 249 } 250 if ( zExp < 0 ) { 251 isTiny = 252 ( float_detect_tininess == 253 || ( zExp < -1 ) 254 || ( zSig + roundIncrement < 0 255 shift32RightJamming( zSig, - zExp, 256 zExp = 0; 257 roundBits = zSig & 0x7F; 258 if ( isTiny && roundBits ) roundDa 259 } 260 } 261 if ( roundBits ) roundData->exception |= f 262 zSig = ( zSig + roundIncrement )>>7; 263 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) 264 if ( zSig == 0 ) zExp = 0; 265 return packFloat32( zSign, zExp, zSig ); 266 267 } 268 269 /* 270 ---------------------------------------------- 271 Takes an abstract floating-point value having 272 and significand `zSig', and returns the proper 273 point value corresponding to the abstract inpu 274 `roundAndPackFloat32' except that `zSig' does 275 any way. In all cases, `zExp' must be 1 less 276 point exponent. 277 ---------------------------------------------- 278 */ 279 static float32 280 normalizeRoundAndPackFloat32( struct rounding 281 { 282 int8 shiftCount; 283 284 shiftCount = countLeadingZeros32( zSig ) - 285 return roundAndPackFloat32( roundData, zSi 286 287 } 288 289 /* 290 ---------------------------------------------- 291 Returns the fraction bits of the double-precis 292 ---------------------------------------------- 293 */ 294 INLINE bits64 extractFloat64Frac( float64 a ) 295 { 296 297 return a & LIT64( 0x000FFFFFFFFFFFFF ); 298 299 } 300 301 /* 302 ---------------------------------------------- 303 Returns the exponent bits of the double-precis 304 ---------------------------------------------- 305 */ 306 INLINE int16 extractFloat64Exp( float64 a ) 307 { 308 309 return ( a>>52 ) & 0x7FF; 310 311 } 312 313 /* 314 ---------------------------------------------- 315 Returns the sign bit of the double-precision f 316 ---------------------------------------------- 317 */ 318 #if 0 /* in softfloat.h */ 319 INLINE flag extractFloat64Sign( float64 a ) 320 { 321 322 return a>>63; 323 324 } 325 #endif 326 327 /* 328 ---------------------------------------------- 329 Normalizes the subnormal double-precision floa 330 by the denormalized significand `aSig'. The n 331 significand are stored at the locations pointe 332 `zSigPtr', respectively. 333 ---------------------------------------------- 334 */ 335 static void 336 normalizeFloat64Subnormal( bits64 aSig, int16 337 { 338 int8 shiftCount; 339 340 shiftCount = countLeadingZeros64( aSig ) - 341 *zSigPtr = aSig<<shiftCount; 342 *zExpPtr = 1 - shiftCount; 343 344 } 345 346 /* 347 ---------------------------------------------- 348 Packs the sign `zSign', exponent `zExp', and s 349 double-precision floating-point value, returni 350 shifted into the proper positions, the three f 351 together to form the result. This means that 352 will be added into the exponent. Since a prop 353 will have an integer portion equal to 1, the ` 354 than the desired result exponent whenever `zSi 355 significand. 356 ---------------------------------------------- 357 */ 358 INLINE float64 packFloat64( flag zSign, int16 359 { 360 361 return ( ( (bits64) zSign )<<63 ) + ( ( (b 362 363 } 364 365 /* 366 ---------------------------------------------- 367 Takes an abstract floating-point value having 368 and significand `zSig', and returns the proper 369 point value corresponding to the abstract inpu 370 value is simply rounded and packed into the do 371 the inexact exception raised if the abstract i 372 exactly. If the abstract value is too large, 373 inexact exceptions are raised and an infinity 374 returned. If the abstract value is too small, 375 a subnormal number, and the underflow and inex 376 the abstract input cannot be represented exact 377 precision floating-point number. 378 The input significand `zSig' has its binar 379 and 61, which is 10 bits to the left of the us 380 significand must be normalized or smaller. If 381 `zExp' must be 0; in that case, the result ret 382 and it must not require rounding. In the usua 383 normalized, `zExp' must be 1 less than the ``t 384 The handling of underflow and overflow follows 385 Binary Floating-point Arithmetic. 386 ---------------------------------------------- 387 */ 388 static float64 roundAndPackFloat64( struct rou 389 { 390 int8 roundingMode; 391 flag roundNearestEven; 392 int16 roundIncrement, roundBits; 393 flag isTiny; 394 395 roundingMode = roundData->mode; 396 roundNearestEven = ( roundingMode == float 397 roundIncrement = 0x200; 398 if ( ! roundNearestEven ) { 399 if ( roundingMode == float_round_to_ze 400 roundIncrement = 0; 401 } 402 else { 403 roundIncrement = 0x3FF; 404 if ( zSign ) { 405 if ( roundingMode == float_rou 406 } 407 else { 408 if ( roundingMode == float_rou 409 } 410 } 411 } 412 roundBits = zSig & 0x3FF; 413 if ( 0x7FD <= (bits16) zExp ) { 414 if ( ( 0x7FD < zExp ) 415 || ( ( zExp == 0x7FD ) 416 && ( (sbits64) ( zSig + roun 417 ) { 418 //register int lr = __builtin_retu 419 //printk("roundAndPackFloat64 call 420 roundData->exception |= float_flag 421 return packFloat64( zSign, 0x7FF, 422 } 423 if ( zExp < 0 ) { 424 isTiny = 425 ( float_detect_tininess == 426 || ( zExp < -1 ) 427 || ( zSig + roundIncrement < L 428 shift64RightJamming( zSig, - zExp, 429 zExp = 0; 430 roundBits = zSig & 0x3FF; 431 if ( isTiny && roundBits ) roundDa 432 } 433 } 434 if ( roundBits ) roundData->exception |= f 435 zSig = ( zSig + roundIncrement )>>10; 436 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) 437 if ( zSig == 0 ) zExp = 0; 438 return packFloat64( zSign, zExp, zSig ); 439 440 } 441 442 /* 443 ---------------------------------------------- 444 Takes an abstract floating-point value having 445 and significand `zSig', and returns the proper 446 point value corresponding to the abstract inpu 447 `roundAndPackFloat64' except that `zSig' does 448 any way. In all cases, `zExp' must be 1 less 449 point exponent. 450 ---------------------------------------------- 451 */ 452 static float64 453 normalizeRoundAndPackFloat64( struct rounding 454 { 455 int8 shiftCount; 456 457 shiftCount = countLeadingZeros64( zSig ) - 458 return roundAndPackFloat64( roundData, zSi 459 460 } 461 462 #ifdef FLOATX80 463 464 /* 465 ---------------------------------------------- 466 Returns the fraction bits of the extended doub 467 value `a'. 468 ---------------------------------------------- 469 */ 470 INLINE bits64 extractFloatx80Frac( floatx80 a 471 { 472 473 return a.low; 474 475 } 476 477 /* 478 ---------------------------------------------- 479 Returns the exponent bits of the extended doub 480 value `a'. 481 ---------------------------------------------- 482 */ 483 INLINE int32 extractFloatx80Exp( floatx80 a ) 484 { 485 486 return a.high & 0x7FFF; 487 488 } 489 490 /* 491 ---------------------------------------------- 492 Returns the sign bit of the extended double-pr 493 `a'. 494 ---------------------------------------------- 495 */ 496 INLINE flag extractFloatx80Sign( floatx80 a ) 497 { 498 499 return a.high>>15; 500 501 } 502 503 /* 504 ---------------------------------------------- 505 Normalizes the subnormal extended double-preci 506 represented by the denormalized significand `a 507 and significand are stored at the locations po 508 `zSigPtr', respectively. 509 ---------------------------------------------- 510 */ 511 static void 512 normalizeFloatx80Subnormal( bits64 aSig, int3 513 { 514 int8 shiftCount; 515 516 shiftCount = countLeadingZeros64( aSig ); 517 *zSigPtr = aSig<<shiftCount; 518 *zExpPtr = 1 - shiftCount; 519 520 } 521 522 /* 523 ---------------------------------------------- 524 Packs the sign `zSign', exponent `zExp', and s 525 extended double-precision floating-point value 526 ---------------------------------------------- 527 */ 528 INLINE floatx80 packFloatx80( flag zSign, int3 529 { 530 floatx80 z; 531 532 z.low = zSig; 533 z.high = ( ( (bits16) zSign )<<15 ) + zExp 534 z.__padding = 0; 535 return z; 536 537 } 538 539 /* 540 ---------------------------------------------- 541 Takes an abstract floating-point value having 542 and extended significand formed by the concate 543 and returns the proper extended double-precisi 544 corresponding to the abstract input. Ordinari 545 rounded and packed into the extended double-pr 546 inexact exception raised if the abstract input 547 exactly. If the abstract value is too large, 548 inexact exceptions are raised and an infinity 549 returned. If the abstract value is too small, 550 a subnormal number, and the underflow and inex 551 the abstract input cannot be represented exact 552 double-precision floating-point number. 553 If `roundingPrecision' is 32 or 64, the re 554 number of bits as single or double precision, 555 result is rounded to the full precision of the 556 format. 557 The input significand must be normalized o 558 significand is not normalized, `zExp' must be 559 returned is a subnormal number, and it must no 560 handling of underflow and overflow follows the 561 Floating-point Arithmetic. 562 ---------------------------------------------- 563 */ 564 static floatx80 565 roundAndPackFloatx80( 566 struct roundingData *roundData, flag zSig 567 ) 568 { 569 int8 roundingMode, roundingPrecision; 570 flag roundNearestEven, increment, isTiny; 571 int64 roundIncrement, roundMask, roundBits 572 573 roundingMode = roundData->mode; 574 roundingPrecision = roundData->precision; 575 roundNearestEven = ( roundingMode == float 576 if ( roundingPrecision == 80 ) goto precis 577 if ( roundingPrecision == 64 ) { 578 roundIncrement = LIT64( 0x000000000000 579 roundMask = LIT64( 0x00000000000007FF 580 } 581 else if ( roundingPrecision == 32 ) { 582 roundIncrement = LIT64( 0x000000800000 583 roundMask = LIT64( 0x000000FFFFFFFFFF 584 } 585 else { 586 goto precision80; 587 } 588 zSig0 |= ( zSig1 != 0 ); 589 if ( ! roundNearestEven ) { 590 if ( roundingMode == float_round_to_ze 591 roundIncrement = 0; 592 } 593 else { 594 roundIncrement = roundMask; 595 if ( zSign ) { 596 if ( roundingMode == float_rou 597 } 598 else { 599 if ( roundingMode == float_rou 600 } 601 } 602 } 603 roundBits = zSig0 & roundMask; 604 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 605 if ( ( 0x7FFE < zExp ) 606 || ( ( zExp == 0x7FFE ) && ( zSig 607 ) { 608 goto overflow; 609 } 610 if ( zExp <= 0 ) { 611 isTiny = 612 ( float_detect_tininess == 613 || ( zExp < 0 ) 614 || ( zSig0 <= zSig0 + roundInc 615 shift64RightJamming( zSig0, 1 - zE 616 zExp = 0; 617 roundBits = zSig0 & roundMask; 618 if ( isTiny && roundBits ) roundDa 619 if ( roundBits ) roundData->except 620 zSig0 += roundIncrement; 621 if ( (sbits64) zSig0 < 0 ) zExp = 622 roundIncrement = roundMask + 1; 623 if ( roundNearestEven && ( roundBi 624 roundMask |= roundIncrement; 625 } 626 zSig0 &= ~ roundMask; 627 return packFloatx80( zSign, zExp, 628 } 629 } 630 if ( roundBits ) roundData->exception |= f 631 zSig0 += roundIncrement; 632 if ( zSig0 < roundIncrement ) { 633 ++zExp; 634 zSig0 = LIT64( 0x8000000000000000 ); 635 } 636 roundIncrement = roundMask + 1; 637 if ( roundNearestEven && ( roundBits<<1 == 638 roundMask |= roundIncrement; 639 } 640 zSig0 &= ~ roundMask; 641 if ( zSig0 == 0 ) zExp = 0; 642 return packFloatx80( zSign, zExp, zSig0 ); 643 precision80: 644 increment = ( (sbits64) zSig1 < 0 ); 645 if ( ! roundNearestEven ) { 646 if ( roundingMode == float_round_to_ze 647 increment = 0; 648 } 649 else { 650 if ( zSign ) { 651 increment = ( roundingMode == 652 } 653 else { 654 increment = ( roundingMode == 655 } 656 } 657 } 658 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 659 if ( ( 0x7FFE < zExp ) 660 || ( ( zExp == 0x7FFE ) 661 && ( zSig0 == LIT64( 0xFFFFF 662 && increment 663 ) 664 ) { 665 roundMask = 0; 666 overflow: 667 roundData->exception |= float_flag 668 if ( ( roundingMode == float_ro 669 || ( zSign && ( roundingMode 670 || ( ! zSign && ( roundingMod 671 ) { 672 return packFloatx80( zSign, 0x 673 } 674 return packFloatx80( zSign, 0x7FFF 675 } 676 if ( zExp <= 0 ) { 677 isTiny = 678 ( float_detect_tininess == 679 || ( zExp < 0 ) 680 || ! increment 681 || ( zSig0 < LIT64( 0xFFFFFFFF 682 shift64ExtraRightJamming( zSig0, z 683 zExp = 0; 684 if ( isTiny && zSig1 ) roundData-> 685 if ( zSig1 ) roundData->exception 686 if ( roundNearestEven ) { 687 increment = ( (sbits64) zSig1 688 } 689 else { 690 if ( zSign ) { 691 increment = ( roundingMode 692 } 693 else { 694 increment = ( roundingMode 695 } 696 } 697 if ( increment ) { 698 ++zSig0; 699 zSig0 &= ~ ( ( zSig1 + zSig1 = 700 if ( (sbits64) zSig0 < 0 ) zEx 701 } 702 return packFloatx80( zSign, zExp, 703 } 704 } 705 if ( zSig1 ) roundData->exception |= float 706 if ( increment ) { 707 ++zSig0; 708 if ( zSig0 == 0 ) { 709 ++zExp; 710 zSig0 = LIT64( 0x8000000000000000 711 } 712 else { 713 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 714 } 715 } 716 else { 717 if ( zSig0 == 0 ) zExp = 0; 718 } 719 720 return packFloatx80( zSign, zExp, zSig0 ); 721 } 722 723 /* 724 ---------------------------------------------- 725 Takes an abstract floating-point value having 726 `zExp', and significand formed by the concaten 727 and returns the proper extended double-precisi 728 corresponding to the abstract input. This rou 729 `roundAndPackFloatx80' except that the input s 730 normalized. 731 ---------------------------------------------- 732 */ 733 static floatx80 734 normalizeRoundAndPackFloatx80( 735 struct roundingData *roundData, flag zSig 736 ) 737 { 738 int8 shiftCount; 739 740 if ( zSig0 == 0 ) { 741 zSig0 = zSig1; 742 zSig1 = 0; 743 zExp -= 64; 744 } 745 shiftCount = countLeadingZeros64( zSig0 ); 746 shortShift128Left( zSig0, zSig1, shiftCoun 747 zExp -= shiftCount; 748 return 749 roundAndPackFloatx80( roundData, zSign 750 751 } 752 753 #endif 754 755 /* 756 ---------------------------------------------- 757 Returns the result of converting the 32-bit tw 758 the single-precision floating-point format. T 759 according to the IEC/IEEE Standard for Binary 760 ---------------------------------------------- 761 */ 762 float32 int32_to_float32(struct roundingData * 763 { 764 flag zSign; 765 766 if ( a == 0 ) return 0; 767 if ( a == 0x80000000 ) return packFloat32( 768 zSign = ( a < 0 ); 769 return normalizeRoundAndPackFloat32( round 770 771 } 772 773 /* 774 ---------------------------------------------- 775 Returns the result of converting the 32-bit tw 776 the double-precision floating-point format. T 777 according to the IEC/IEEE Standard for Binary 778 ---------------------------------------------- 779 */ 780 float64 int32_to_float64( int32 a ) 781 { 782 flag aSign; 783 uint32 absA; 784 int8 shiftCount; 785 bits64 zSig; 786 787 if ( a == 0 ) return 0; 788 aSign = ( a < 0 ); 789 absA = aSign ? - a : a; 790 shiftCount = countLeadingZeros32( absA ) + 791 zSig = absA; 792 return packFloat64( aSign, 0x432 - shiftCo 793 794 } 795 796 #ifdef FLOATX80 797 798 /* 799 ---------------------------------------------- 800 Returns the result of converting the 32-bit tw 801 to the extended double-precision floating-poin 802 is performed according to the IEC/IEEE Standar 803 Arithmetic. 804 ---------------------------------------------- 805 */ 806 floatx80 int32_to_floatx80( int32 a ) 807 { 808 flag zSign; 809 uint32 absA; 810 int8 shiftCount; 811 bits64 zSig; 812 813 if ( a == 0 ) return packFloatx80( 0, 0, 0 814 zSign = ( a < 0 ); 815 absA = zSign ? - a : a; 816 shiftCount = countLeadingZeros32( absA ) + 817 zSig = absA; 818 return packFloatx80( zSign, 0x403E - shift 819 820 } 821 822 #endif 823 824 /* 825 ---------------------------------------------- 826 Returns the result of converting the single-pr 827 `a' to the 32-bit two's complement integer for 828 performed according to the IEC/IEEE Standard f 829 Arithmetic---which means in particular that th 830 according to the current rounding mode. If `a 831 positive integer is returned. Otherwise, if t 832 largest integer with the same sign as `a' is r 833 ---------------------------------------------- 834 */ 835 int32 float32_to_int32( struct roundingData *r 836 { 837 flag aSign; 838 int16 aExp, shiftCount; 839 bits32 aSig; 840 bits64 zSig; 841 842 aSig = extractFloat32Frac( a ); 843 aExp = extractFloat32Exp( a ); 844 aSign = extractFloat32Sign( a ); 845 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0 846 if ( aExp ) aSig |= 0x00800000; 847 shiftCount = 0xAF - aExp; 848 zSig = aSig; 849 zSig <<= 32; 850 if ( 0 < shiftCount ) shift64RightJamming( 851 return roundAndPackInt32( roundData, aSign 852 853 } 854 855 /* 856 ---------------------------------------------- 857 Returns the result of converting the single-pr 858 `a' to the 32-bit two's complement integer for 859 performed according to the IEC/IEEE Standard f 860 Arithmetic, except that the conversion is alwa 861 `a' is a NaN, the largest positive integer is 862 conversion overflows, the largest integer with 863 returned. 864 ---------------------------------------------- 865 */ 866 int32 float32_to_int32_round_to_zero( float32 867 { 868 flag aSign; 869 int16 aExp, shiftCount; 870 bits32 aSig; 871 int32 z; 872 873 aSig = extractFloat32Frac( a ); 874 aExp = extractFloat32Exp( a ); 875 aSign = extractFloat32Sign( a ); 876 shiftCount = aExp - 0x9E; 877 if ( 0 <= shiftCount ) { 878 if ( a == 0xCF000000 ) return 0x800000 879 float_raise( float_flag_invalid ); 880 if ( ! aSign || ( ( aExp == 0xFF ) && 881 return 0x80000000; 882 } 883 else if ( aExp <= 0x7E ) { 884 if ( aExp | aSig ) float_raise( float_ 885 return 0; 886 } 887 aSig = ( aSig | 0x00800000 )<<8; 888 z = aSig>>( - shiftCount ); 889 if ( (bits32) ( aSig<<( shiftCount & 31 ) 890 float_raise( float_flag_inexact ); 891 } 892 return aSign ? - z : z; 893 894 } 895 896 /* 897 ---------------------------------------------- 898 Returns the result of converting the single-pr 899 `a' to the double-precision floating-point for 900 performed according to the IEC/IEEE Standard f 901 Arithmetic. 902 ---------------------------------------------- 903 */ 904 float64 float32_to_float64( float32 a ) 905 { 906 flag aSign; 907 int16 aExp; 908 bits32 aSig; 909 910 aSig = extractFloat32Frac( a ); 911 aExp = extractFloat32Exp( a ); 912 aSign = extractFloat32Sign( a ); 913 if ( aExp == 0xFF ) { 914 if ( aSig ) return commonNaNToFloat64( 915 return packFloat64( aSign, 0x7FF, 0 ); 916 } 917 if ( aExp == 0 ) { 918 if ( aSig == 0 ) return packFloat64( a 919 normalizeFloat32Subnormal( aSig, &aExp 920 --aExp; 921 } 922 return packFloat64( aSign, aExp + 0x380, ( 923 924 } 925 926 #ifdef FLOATX80 927 928 /* 929 ---------------------------------------------- 930 Returns the result of converting the single-pr 931 `a' to the extended double-precision floating- 932 is performed according to the IEC/IEEE Standar 933 Arithmetic. 934 ---------------------------------------------- 935 */ 936 floatx80 float32_to_floatx80( float32 a ) 937 { 938 flag aSign; 939 int16 aExp; 940 bits32 aSig; 941 942 aSig = extractFloat32Frac( a ); 943 aExp = extractFloat32Exp( a ); 944 aSign = extractFloat32Sign( a ); 945 if ( aExp == 0xFF ) { 946 if ( aSig ) return commonNaNToFloatx80 947 return packFloatx80( aSign, 0x7FFF, LI 948 } 949 if ( aExp == 0 ) { 950 if ( aSig == 0 ) return packFloatx80( 951 normalizeFloat32Subnormal( aSig, &aExp 952 } 953 aSig |= 0x00800000; 954 return packFloatx80( aSign, aExp + 0x3F80, 955 956 } 957 958 #endif 959 960 /* 961 ---------------------------------------------- 962 Rounds the single-precision floating-point val 963 returns the result as a single-precision float 964 operation is performed according to the IEC/IE 965 Floating-point Arithmetic. 966 ---------------------------------------------- 967 */ 968 float32 float32_round_to_int( struct roundingD 969 { 970 flag aSign; 971 int16 aExp; 972 bits32 lastBitMask, roundBitsMask; 973 int8 roundingMode; 974 float32 z; 975 976 aExp = extractFloat32Exp( a ); 977 if ( 0x96 <= aExp ) { 978 if ( ( aExp == 0xFF ) && extractFloat3 979 return propagateFloat32NaN( a, a ) 980 } 981 return a; 982 } 983 roundingMode = roundData->mode; 984 if ( aExp <= 0x7E ) { 985 if ( (bits32) ( a<<1 ) == 0 ) return a 986 roundData->exception |= float_flag_ine 987 aSign = extractFloat32Sign( a ); 988 switch ( roundingMode ) { 989 case float_round_nearest_even: 990 if ( ( aExp == 0x7E ) && extractFl 991 return packFloat32( aSign, 0x7 992 } 993 break; 994 case float_round_down: 995 return aSign ? 0xBF800000 : 0; 996 case float_round_up: 997 return aSign ? 0x80000000 : 0x3F80 998 } 999 return packFloat32( aSign, 0, 0 ); 1000 } 1001 lastBitMask = 1; 1002 lastBitMask <<= 0x96 - aExp; 1003 roundBitsMask = lastBitMask - 1; 1004 z = a; 1005 if ( roundingMode == float_round_nearest_ 1006 z += lastBitMask>>1; 1007 if ( ( z & roundBitsMask ) == 0 ) z & 1008 } 1009 else if ( roundingMode != float_round_to_ 1010 if ( extractFloat32Sign( z ) ^ ( roun 1011 z += roundBitsMask; 1012 } 1013 } 1014 z &= ~ roundBitsMask; 1015 if ( z != a ) roundData->exception |= flo 1016 return z; 1017 1018 } 1019 1020 /* 1021 --------------------------------------------- 1022 Returns the result of adding the absolute val 1023 floating-point values `a' and `b'. If `zSign 1024 before being returned. `zSign' is ignored if 1025 addition is performed according to the IEC/IE 1026 Floating-point Arithmetic. 1027 --------------------------------------------- 1028 */ 1029 static float32 addFloat32Sigs( struct roundin 1030 { 1031 int16 aExp, bExp, zExp; 1032 bits32 aSig, bSig, zSig; 1033 int16 expDiff; 1034 1035 aSig = extractFloat32Frac( a ); 1036 aExp = extractFloat32Exp( a ); 1037 bSig = extractFloat32Frac( b ); 1038 bExp = extractFloat32Exp( b ); 1039 expDiff = aExp - bExp; 1040 aSig <<= 6; 1041 bSig <<= 6; 1042 if ( 0 < expDiff ) { 1043 if ( aExp == 0xFF ) { 1044 if ( aSig ) return propagateFloat 1045 return a; 1046 } 1047 if ( bExp == 0 ) { 1048 --expDiff; 1049 } 1050 else { 1051 bSig |= 0x20000000; 1052 } 1053 shift32RightJamming( bSig, expDiff, & 1054 zExp = aExp; 1055 } 1056 else if ( expDiff < 0 ) { 1057 if ( bExp == 0xFF ) { 1058 if ( bSig ) return propagateFloat 1059 return packFloat32( zSign, 0xFF, 1060 } 1061 if ( aExp == 0 ) { 1062 ++expDiff; 1063 } 1064 else { 1065 aSig |= 0x20000000; 1066 } 1067 shift32RightJamming( aSig, - expDiff, 1068 zExp = bExp; 1069 } 1070 else { 1071 if ( aExp == 0xFF ) { 1072 if ( aSig | bSig ) return propaga 1073 return a; 1074 } 1075 if ( aExp == 0 ) return packFloat32( 1076 zSig = 0x40000000 + aSig + bSig; 1077 zExp = aExp; 1078 goto roundAndPack; 1079 } 1080 aSig |= 0x20000000; 1081 zSig = ( aSig + bSig )<<1; 1082 --zExp; 1083 if ( (sbits32) zSig < 0 ) { 1084 zSig = aSig + bSig; 1085 ++zExp; 1086 } 1087 roundAndPack: 1088 return roundAndPackFloat32( roundData, zS 1089 1090 } 1091 1092 /* 1093 --------------------------------------------- 1094 Returns the result of subtracting the absolut 1095 precision floating-point values `a' and `b'. 1096 difference is negated before being returned. 1097 result is a NaN. The subtraction is performe 1098 Standard for Binary Floating-point Arithmetic 1099 --------------------------------------------- 1100 */ 1101 static float32 subFloat32Sigs( struct roundin 1102 { 1103 int16 aExp, bExp, zExp; 1104 bits32 aSig, bSig, zSig; 1105 int16 expDiff; 1106 1107 aSig = extractFloat32Frac( a ); 1108 aExp = extractFloat32Exp( a ); 1109 bSig = extractFloat32Frac( b ); 1110 bExp = extractFloat32Exp( b ); 1111 expDiff = aExp - bExp; 1112 aSig <<= 7; 1113 bSig <<= 7; 1114 if ( 0 < expDiff ) goto aExpBigger; 1115 if ( expDiff < 0 ) goto bExpBigger; 1116 if ( aExp == 0xFF ) { 1117 if ( aSig | bSig ) return propagateFl 1118 roundData->exception |= float_flag_in 1119 return float32_default_nan; 1120 } 1121 if ( aExp == 0 ) { 1122 aExp = 1; 1123 bExp = 1; 1124 } 1125 if ( bSig < aSig ) goto aBigger; 1126 if ( aSig < bSig ) goto bBigger; 1127 return packFloat32( roundData->mode == fl 1128 bExpBigger: 1129 if ( bExp == 0xFF ) { 1130 if ( bSig ) return propagateFloat32Na 1131 return packFloat32( zSign ^ 1, 0xFF, 1132 } 1133 if ( aExp == 0 ) { 1134 ++expDiff; 1135 } 1136 else { 1137 aSig |= 0x40000000; 1138 } 1139 shift32RightJamming( aSig, - expDiff, &aS 1140 bSig |= 0x40000000; 1141 bBigger: 1142 zSig = bSig - aSig; 1143 zExp = bExp; 1144 zSign ^= 1; 1145 goto normalizeRoundAndPack; 1146 aExpBigger: 1147 if ( aExp == 0xFF ) { 1148 if ( aSig ) return propagateFloat32Na 1149 return a; 1150 } 1151 if ( bExp == 0 ) { 1152 --expDiff; 1153 } 1154 else { 1155 bSig |= 0x40000000; 1156 } 1157 shift32RightJamming( bSig, expDiff, &bSig 1158 aSig |= 0x40000000; 1159 aBigger: 1160 zSig = aSig - bSig; 1161 zExp = aExp; 1162 normalizeRoundAndPack: 1163 --zExp; 1164 return normalizeRoundAndPackFloat32( roun 1165 1166 } 1167 1168 /* 1169 --------------------------------------------- 1170 Returns the result of adding the single-preci 1171 and `b'. The operation is performed accordin 1172 Binary Floating-point Arithmetic. 1173 --------------------------------------------- 1174 */ 1175 float32 float32_add( struct roundingData *rou 1176 { 1177 flag aSign, bSign; 1178 1179 aSign = extractFloat32Sign( a ); 1180 bSign = extractFloat32Sign( b ); 1181 if ( aSign == bSign ) { 1182 return addFloat32Sigs( roundData, a, 1183 } 1184 else { 1185 return subFloat32Sigs( roundData, a, 1186 } 1187 1188 } 1189 1190 /* 1191 --------------------------------------------- 1192 Returns the result of subtracting the single- 1193 `a' and `b'. The operation is performed acco 1194 for Binary Floating-point Arithmetic. 1195 --------------------------------------------- 1196 */ 1197 float32 float32_sub( struct roundingData *rou 1198 { 1199 flag aSign, bSign; 1200 1201 aSign = extractFloat32Sign( a ); 1202 bSign = extractFloat32Sign( b ); 1203 if ( aSign == bSign ) { 1204 return subFloat32Sigs( roundData, a, 1205 } 1206 else { 1207 return addFloat32Sigs( roundData, a, 1208 } 1209 1210 } 1211 1212 /* 1213 --------------------------------------------- 1214 Returns the result of multiplying the single- 1215 `a' and `b'. The operation is performed acco 1216 for Binary Floating-point Arithmetic. 1217 --------------------------------------------- 1218 */ 1219 float32 float32_mul( struct roundingData *rou 1220 { 1221 flag aSign, bSign, zSign; 1222 int16 aExp, bExp, zExp; 1223 bits32 aSig, bSig; 1224 bits64 zSig64; 1225 bits32 zSig; 1226 1227 aSig = extractFloat32Frac( a ); 1228 aExp = extractFloat32Exp( a ); 1229 aSign = extractFloat32Sign( a ); 1230 bSig = extractFloat32Frac( b ); 1231 bExp = extractFloat32Exp( b ); 1232 bSign = extractFloat32Sign( b ); 1233 zSign = aSign ^ bSign; 1234 if ( aExp == 0xFF ) { 1235 if ( aSig || ( ( bExp == 0xFF ) && bS 1236 return propagateFloat32NaN( a, b 1237 } 1238 if ( ( bExp | bSig ) == 0 ) { 1239 roundData->exception |= float_fla 1240 return float32_default_nan; 1241 } 1242 return packFloat32( zSign, 0xFF, 0 ); 1243 } 1244 if ( bExp == 0xFF ) { 1245 if ( bSig ) return propagateFloat32Na 1246 if ( ( aExp | aSig ) == 0 ) { 1247 roundData->exception |= float_fla 1248 return float32_default_nan; 1249 } 1250 return packFloat32( zSign, 0xFF, 0 ); 1251 } 1252 if ( aExp == 0 ) { 1253 if ( aSig == 0 ) return packFloat32( 1254 normalizeFloat32Subnormal( aSig, &aEx 1255 } 1256 if ( bExp == 0 ) { 1257 if ( bSig == 0 ) return packFloat32( 1258 normalizeFloat32Subnormal( bSig, &bEx 1259 } 1260 zExp = aExp + bExp - 0x7F; 1261 aSig = ( aSig | 0x00800000 )<<7; 1262 bSig = ( bSig | 0x00800000 )<<8; 1263 shift64RightJamming( ( (bits64) aSig ) * 1264 zSig = zSig64; 1265 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1266 zSig <<= 1; 1267 --zExp; 1268 } 1269 return roundAndPackFloat32( roundData, zS 1270 1271 } 1272 1273 /* 1274 --------------------------------------------- 1275 Returns the result of dividing the single-pre 1276 by the corresponding value `b'. The operatio 1277 IEC/IEEE Standard for Binary Floating-point A 1278 --------------------------------------------- 1279 */ 1280 float32 float32_div( struct roundingData *rou 1281 { 1282 flag aSign, bSign, zSign; 1283 int16 aExp, bExp, zExp; 1284 bits32 aSig, bSig, zSig; 1285 1286 aSig = extractFloat32Frac( a ); 1287 aExp = extractFloat32Exp( a ); 1288 aSign = extractFloat32Sign( a ); 1289 bSig = extractFloat32Frac( b ); 1290 bExp = extractFloat32Exp( b ); 1291 bSign = extractFloat32Sign( b ); 1292 zSign = aSign ^ bSign; 1293 if ( aExp == 0xFF ) { 1294 if ( aSig ) return propagateFloat32Na 1295 if ( bExp == 0xFF ) { 1296 if ( bSig ) return propagateFloat 1297 roundData->exception |= float_fla 1298 return float32_default_nan; 1299 } 1300 return packFloat32( zSign, 0xFF, 0 ); 1301 } 1302 if ( bExp == 0xFF ) { 1303 if ( bSig ) return propagateFloat32Na 1304 return packFloat32( zSign, 0, 0 ); 1305 } 1306 if ( bExp == 0 ) { 1307 if ( bSig == 0 ) { 1308 if ( ( aExp | aSig ) == 0 ) { 1309 roundData->exception |= float 1310 return float32_default_nan; 1311 } 1312 roundData->exception |= float_fla 1313 return packFloat32( zSign, 0xFF, 1314 } 1315 normalizeFloat32Subnormal( bSig, &bEx 1316 } 1317 if ( aExp == 0 ) { 1318 if ( aSig == 0 ) return packFloat32( 1319 normalizeFloat32Subnormal( aSig, &aEx 1320 } 1321 zExp = aExp - bExp + 0x7D; 1322 aSig = ( aSig | 0x00800000 )<<7; 1323 bSig = ( bSig | 0x00800000 )<<8; 1324 if ( bSig <= ( aSig + aSig ) ) { 1325 aSig >>= 1; 1326 ++zExp; 1327 } 1328 { 1329 bits64 tmp = ( (bits64) aSig )<<32; 1330 do_div( tmp, bSig ); 1331 zSig = tmp; 1332 } 1333 if ( ( zSig & 0x3F ) == 0 ) { 1334 zSig |= ( ( (bits64) bSig ) * zSig != 1335 } 1336 return roundAndPackFloat32( roundData, zS 1337 1338 } 1339 1340 /* 1341 --------------------------------------------- 1342 Returns the remainder of the single-precision 1343 with respect to the corresponding value `b'. 1344 according to the IEC/IEEE Standard for Binary 1345 --------------------------------------------- 1346 */ 1347 float32 float32_rem( struct roundingData *rou 1348 { 1349 flag aSign, bSign, zSign; 1350 int16 aExp, bExp, expDiff; 1351 bits32 aSig, bSig; 1352 bits32 q; 1353 bits64 aSig64, bSig64, q64; 1354 bits32 alternateASig; 1355 sbits32 sigMean; 1356 1357 aSig = extractFloat32Frac( a ); 1358 aExp = extractFloat32Exp( a ); 1359 aSign = extractFloat32Sign( a ); 1360 bSig = extractFloat32Frac( b ); 1361 bExp = extractFloat32Exp( b ); 1362 bSign = extractFloat32Sign( b ); 1363 if ( aExp == 0xFF ) { 1364 if ( aSig || ( ( bExp == 0xFF ) && bS 1365 return propagateFloat32NaN( a, b 1366 } 1367 roundData->exception |= float_flag_in 1368 return float32_default_nan; 1369 } 1370 if ( bExp == 0xFF ) { 1371 if ( bSig ) return propagateFloat32Na 1372 return a; 1373 } 1374 if ( bExp == 0 ) { 1375 if ( bSig == 0 ) { 1376 roundData->exception |= float_fla 1377 return float32_default_nan; 1378 } 1379 normalizeFloat32Subnormal( bSig, &bEx 1380 } 1381 if ( aExp == 0 ) { 1382 if ( aSig == 0 ) return a; 1383 normalizeFloat32Subnormal( aSig, &aEx 1384 } 1385 expDiff = aExp - bExp; 1386 aSig |= 0x00800000; 1387 bSig |= 0x00800000; 1388 if ( expDiff < 32 ) { 1389 aSig <<= 8; 1390 bSig <<= 8; 1391 if ( expDiff < 0 ) { 1392 if ( expDiff < -1 ) return a; 1393 aSig >>= 1; 1394 } 1395 q = ( bSig <= aSig ); 1396 if ( q ) aSig -= bSig; 1397 if ( 0 < expDiff ) { 1398 bits64 tmp = ( (bits64) aSig )<<3 1399 do_div( tmp, bSig ); 1400 q = tmp; 1401 q >>= 32 - expDiff; 1402 bSig >>= 2; 1403 aSig = ( ( aSig>>1 )<<( expDiff - 1404 } 1405 else { 1406 aSig >>= 2; 1407 bSig >>= 2; 1408 } 1409 } 1410 else { 1411 if ( bSig <= aSig ) aSig -= bSig; 1412 aSig64 = ( (bits64) aSig )<<40; 1413 bSig64 = ( (bits64) bSig )<<40; 1414 expDiff -= 64; 1415 while ( 0 < expDiff ) { 1416 q64 = estimateDiv128To64( aSig64, 1417 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1418 aSig64 = - ( ( bSig * q64 )<<38 ) 1419 expDiff -= 62; 1420 } 1421 expDiff += 64; 1422 q64 = estimateDiv128To64( aSig64, 0, 1423 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1424 q = q64>>( 64 - expDiff ); 1425 bSig <<= 6; 1426 aSig = ( ( aSig64>>33 )<<( expDiff - 1427 } 1428 do { 1429 alternateASig = aSig; 1430 ++q; 1431 aSig -= bSig; 1432 } while ( 0 <= (sbits32) aSig ); 1433 sigMean = aSig + alternateASig; 1434 if ( ( sigMean < 0 ) || ( ( sigMean == 0 1435 aSig = alternateASig; 1436 } 1437 zSign = ( (sbits32) aSig < 0 ); 1438 if ( zSign ) aSig = - aSig; 1439 return normalizeRoundAndPackFloat32( roun 1440 1441 } 1442 1443 /* 1444 --------------------------------------------- 1445 Returns the square root of the single-precisi 1446 The operation is performed according to the I 1447 Floating-point Arithmetic. 1448 --------------------------------------------- 1449 */ 1450 float32 float32_sqrt( struct roundingData *ro 1451 { 1452 flag aSign; 1453 int16 aExp, zExp; 1454 bits32 aSig, zSig; 1455 bits64 rem, term; 1456 1457 aSig = extractFloat32Frac( a ); 1458 aExp = extractFloat32Exp( a ); 1459 aSign = extractFloat32Sign( a ); 1460 if ( aExp == 0xFF ) { 1461 if ( aSig ) return propagateFloat32Na 1462 if ( ! aSign ) return a; 1463 roundData->exception |= float_flag_in 1464 return float32_default_nan; 1465 } 1466 if ( aSign ) { 1467 if ( ( aExp | aSig ) == 0 ) return a; 1468 roundData->exception |= float_flag_in 1469 return float32_default_nan; 1470 } 1471 if ( aExp == 0 ) { 1472 if ( aSig == 0 ) return 0; 1473 normalizeFloat32Subnormal( aSig, &aEx 1474 } 1475 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1476 aSig = ( aSig | 0x00800000 )<<8; 1477 zSig = estimateSqrt32( aExp, aSig ) + 2; 1478 if ( ( zSig & 0x7F ) <= 5 ) { 1479 if ( zSig < 2 ) { 1480 zSig = 0xFFFFFFFF; 1481 } 1482 else { 1483 aSig >>= aExp & 1; 1484 term = ( (bits64) zSig ) * zSig; 1485 rem = ( ( (bits64) aSig )<<32 ) - 1486 while ( (sbits64) rem < 0 ) { 1487 --zSig; 1488 rem += ( ( (bits64) zSig )<<1 1489 } 1490 zSig |= ( rem != 0 ); 1491 } 1492 } 1493 shift32RightJamming( zSig, 1, &zSig ); 1494 return roundAndPackFloat32( roundData, 0, 1495 1496 } 1497 1498 /* 1499 --------------------------------------------- 1500 Returns 1 if the single-precision floating-po 1501 corresponding value `b', and 0 otherwise. Th 1502 according to the IEC/IEEE Standard for Binary 1503 --------------------------------------------- 1504 */ 1505 flag float32_eq( float32 a, float32 b ) 1506 { 1507 1508 if ( ( ( extractFloat32Exp( a ) == 0xF 1509 || ( ( extractFloat32Exp( b ) == 0xF 1510 ) { 1511 if ( float32_is_signaling_nan( a ) || 1512 float_raise( float_flag_invalid ) 1513 } 1514 return 0; 1515 } 1516 return ( a == b ) || ( (bits32) ( ( a | b 1517 1518 } 1519 1520 /* 1521 --------------------------------------------- 1522 Returns 1 if the single-precision floating-po 1523 equal to the corresponding value `b', and 0 o 1524 performed according to the IEC/IEEE Standard 1525 Arithmetic. 1526 --------------------------------------------- 1527 */ 1528 flag float32_le( float32 a, float32 b ) 1529 { 1530 flag aSign, bSign; 1531 1532 if ( ( ( extractFloat32Exp( a ) == 0xF 1533 || ( ( extractFloat32Exp( b ) == 0xF 1534 ) { 1535 float_raise( float_flag_invalid ); 1536 return 0; 1537 } 1538 aSign = extractFloat32Sign( a ); 1539 bSign = extractFloat32Sign( b ); 1540 if ( aSign != bSign ) return aSign || ( ( 1541 return ( a == b ) || ( aSign ^ ( a < b ) 1542 1543 } 1544 1545 /* 1546 --------------------------------------------- 1547 Returns 1 if the single-precision floating-po 1548 the corresponding value `b', and 0 otherwise. 1549 according to the IEC/IEEE Standard for Binary 1550 --------------------------------------------- 1551 */ 1552 flag float32_lt( float32 a, float32 b ) 1553 { 1554 flag aSign, bSign; 1555 1556 if ( ( ( extractFloat32Exp( a ) == 0xF 1557 || ( ( extractFloat32Exp( b ) == 0xF 1558 ) { 1559 float_raise( float_flag_invalid ); 1560 return 0; 1561 } 1562 aSign = extractFloat32Sign( a ); 1563 bSign = extractFloat32Sign( b ); 1564 if ( aSign != bSign ) return aSign && ( ( 1565 return ( a != b ) && ( aSign ^ ( a < b ) 1566 1567 } 1568 1569 /* 1570 --------------------------------------------- 1571 Returns 1 if the single-precision floating-po 1572 corresponding value `b', and 0 otherwise. Th 1573 if either operand is a NaN. Otherwise, the c 1574 according to the IEC/IEEE Standard for Binary 1575 --------------------------------------------- 1576 */ 1577 flag float32_eq_signaling( float32 a, float32 1578 { 1579 1580 if ( ( ( extractFloat32Exp( a ) == 0xF 1581 || ( ( extractFloat32Exp( b ) == 0xF 1582 ) { 1583 float_raise( float_flag_invalid ); 1584 return 0; 1585 } 1586 return ( a == b ) || ( (bits32) ( ( a | b 1587 1588 } 1589 1590 /* 1591 --------------------------------------------- 1592 Returns 1 if the single-precision floating-po 1593 equal to the corresponding value `b', and 0 o 1594 cause an exception. Otherwise, the compariso 1595 IEC/IEEE Standard for Binary Floating-point A 1596 --------------------------------------------- 1597 */ 1598 flag float32_le_quiet( float32 a, float32 b ) 1599 { 1600 flag aSign, bSign; 1601 //int16 aExp, bExp; 1602 1603 if ( ( ( extractFloat32Exp( a ) == 0xF 1604 || ( ( extractFloat32Exp( b ) == 0xF 1605 ) { 1606 /* Do nothing, even if NaN as we're q 1607 return 0; 1608 } 1609 aSign = extractFloat32Sign( a ); 1610 bSign = extractFloat32Sign( b ); 1611 if ( aSign != bSign ) return aSign || ( ( 1612 return ( a == b ) || ( aSign ^ ( a < b ) 1613 1614 } 1615 1616 /* 1617 --------------------------------------------- 1618 Returns 1 if the single-precision floating-po 1619 the corresponding value `b', and 0 otherwise. 1620 exception. Otherwise, the comparison is perf 1621 Standard for Binary Floating-point Arithmetic 1622 --------------------------------------------- 1623 */ 1624 flag float32_lt_quiet( float32 a, float32 b ) 1625 { 1626 flag aSign, bSign; 1627 1628 if ( ( ( extractFloat32Exp( a ) == 0xF 1629 || ( ( extractFloat32Exp( b ) == 0xF 1630 ) { 1631 /* Do nothing, even if NaN as we're q 1632 return 0; 1633 } 1634 aSign = extractFloat32Sign( a ); 1635 bSign = extractFloat32Sign( b ); 1636 if ( aSign != bSign ) return aSign && ( ( 1637 return ( a != b ) && ( aSign ^ ( a < b ) 1638 1639 } 1640 1641 /* 1642 --------------------------------------------- 1643 Returns the result of converting the double-p 1644 `a' to the 32-bit two's complement integer fo 1645 performed according to the IEC/IEEE Standard 1646 Arithmetic---which means in particular that t 1647 according to the current rounding mode. If ` 1648 positive integer is returned. Otherwise, if 1649 largest integer with the same sign as `a' is 1650 --------------------------------------------- 1651 */ 1652 int32 float64_to_int32( struct roundingData * 1653 { 1654 flag aSign; 1655 int16 aExp, shiftCount; 1656 bits64 aSig; 1657 1658 aSig = extractFloat64Frac( a ); 1659 aExp = extractFloat64Exp( a ); 1660 aSign = extractFloat64Sign( a ); 1661 if ( ( aExp == 0x7FF ) && aSig ) aSign = 1662 if ( aExp ) aSig |= LIT64( 0x001000000000 1663 shiftCount = 0x42C - aExp; 1664 if ( 0 < shiftCount ) shift64RightJamming 1665 return roundAndPackInt32( roundData, aSig 1666 1667 } 1668 1669 /* 1670 --------------------------------------------- 1671 Returns the result of converting the double-p 1672 `a' to the 32-bit two's complement integer fo 1673 performed according to the IEC/IEEE Standard 1674 Arithmetic, except that the conversion is alw 1675 `a' is a NaN, the largest positive integer is 1676 conversion overflows, the largest integer wit 1677 returned. 1678 --------------------------------------------- 1679 */ 1680 int32 float64_to_int32_round_to_zero( float64 1681 { 1682 flag aSign; 1683 int16 aExp, shiftCount; 1684 bits64 aSig, savedASig; 1685 int32 z; 1686 1687 aSig = extractFloat64Frac( a ); 1688 aExp = extractFloat64Exp( a ); 1689 aSign = extractFloat64Sign( a ); 1690 shiftCount = 0x433 - aExp; 1691 if ( shiftCount < 21 ) { 1692 if ( ( aExp == 0x7FF ) && aSig ) aSig 1693 goto invalid; 1694 } 1695 else if ( 52 < shiftCount ) { 1696 if ( aExp || aSig ) float_raise( floa 1697 return 0; 1698 } 1699 aSig |= LIT64( 0x0010000000000000 ); 1700 savedASig = aSig; 1701 aSig >>= shiftCount; 1702 z = aSig; 1703 if ( aSign ) z = - z; 1704 if ( ( z < 0 ) ^ aSign ) { 1705 invalid: 1706 float_raise( float_flag_invalid ); 1707 return aSign ? 0x80000000 : 0x7FFFFFF 1708 } 1709 if ( ( aSig<<shiftCount ) != savedASig ) 1710 float_raise( float_flag_inexact ); 1711 } 1712 return z; 1713 1714 } 1715 1716 /* 1717 --------------------------------------------- 1718 Returns the result of converting the double-p 1719 `a' to the 32-bit two's complement unsigned i 1720 is performed according to the IEC/IEEE Standa 1721 Arithmetic---which means in particular that t 1722 according to the current rounding mode. If ` 1723 positive integer is returned. Otherwise, if 1724 largest positive integer is returned. 1725 --------------------------------------------- 1726 */ 1727 int32 float64_to_uint32( struct roundingData 1728 { 1729 flag aSign; 1730 int16 aExp, shiftCount; 1731 bits64 aSig; 1732 1733 aSig = extractFloat64Frac( a ); 1734 aExp = extractFloat64Exp( a ); 1735 aSign = 0; //extractFloat64Sign( a ); 1736 //if ( ( aExp == 0x7FF ) && aSig ) aSign 1737 if ( aExp ) aSig |= LIT64( 0x001000000000 1738 shiftCount = 0x42C - aExp; 1739 if ( 0 < shiftCount ) shift64RightJamming 1740 return roundAndPackInt32( roundData, aSig 1741 } 1742 1743 /* 1744 --------------------------------------------- 1745 Returns the result of converting the double-p 1746 `a' to the 32-bit two's complement integer fo 1747 performed according to the IEC/IEEE Standard 1748 Arithmetic, except that the conversion is alw 1749 `a' is a NaN, the largest positive integer is 1750 conversion overflows, the largest positive in 1751 --------------------------------------------- 1752 */ 1753 int32 float64_to_uint32_round_to_zero( float6 1754 { 1755 flag aSign; 1756 int16 aExp, shiftCount; 1757 bits64 aSig, savedASig; 1758 int32 z; 1759 1760 aSig = extractFloat64Frac( a ); 1761 aExp = extractFloat64Exp( a ); 1762 aSign = extractFloat64Sign( a ); 1763 shiftCount = 0x433 - aExp; 1764 if ( shiftCount < 21 ) { 1765 if ( ( aExp == 0x7FF ) && aSig ) aSig 1766 goto invalid; 1767 } 1768 else if ( 52 < shiftCount ) { 1769 if ( aExp || aSig ) float_raise( floa 1770 return 0; 1771 } 1772 aSig |= LIT64( 0x0010000000000000 ); 1773 savedASig = aSig; 1774 aSig >>= shiftCount; 1775 z = aSig; 1776 if ( aSign ) z = - z; 1777 if ( ( z < 0 ) ^ aSign ) { 1778 invalid: 1779 float_raise( float_flag_invalid ); 1780 return aSign ? 0x80000000 : 0x7FFFFFF 1781 } 1782 if ( ( aSig<<shiftCount ) != savedASig ) 1783 float_raise( float_flag_inexact ); 1784 } 1785 return z; 1786 } 1787 1788 /* 1789 --------------------------------------------- 1790 Returns the result of converting the double-p 1791 `a' to the single-precision floating-point fo 1792 performed according to the IEC/IEEE Standard 1793 Arithmetic. 1794 --------------------------------------------- 1795 */ 1796 float32 float64_to_float32( struct roundingDa 1797 { 1798 flag aSign; 1799 int16 aExp; 1800 bits64 aSig; 1801 bits32 zSig; 1802 1803 aSig = extractFloat64Frac( a ); 1804 aExp = extractFloat64Exp( a ); 1805 aSign = extractFloat64Sign( a ); 1806 if ( aExp == 0x7FF ) { 1807 if ( aSig ) return commonNaNToFloat32 1808 return packFloat32( aSign, 0xFF, 0 ); 1809 } 1810 shift64RightJamming( aSig, 22, &aSig ); 1811 zSig = aSig; 1812 if ( aExp || zSig ) { 1813 zSig |= 0x40000000; 1814 aExp -= 0x381; 1815 } 1816 return roundAndPackFloat32( roundData, aS 1817 1818 } 1819 1820 #ifdef FLOATX80 1821 1822 /* 1823 --------------------------------------------- 1824 Returns the result of converting the double-p 1825 `a' to the extended double-precision floating 1826 is performed according to the IEC/IEEE Standa 1827 Arithmetic. 1828 --------------------------------------------- 1829 */ 1830 floatx80 float64_to_floatx80( float64 a ) 1831 { 1832 flag aSign; 1833 int16 aExp; 1834 bits64 aSig; 1835 1836 aSig = extractFloat64Frac( a ); 1837 aExp = extractFloat64Exp( a ); 1838 aSign = extractFloat64Sign( a ); 1839 if ( aExp == 0x7FF ) { 1840 if ( aSig ) return commonNaNToFloatx8 1841 return packFloatx80( aSign, 0x7FFF, L 1842 } 1843 if ( aExp == 0 ) { 1844 if ( aSig == 0 ) return packFloatx80( 1845 normalizeFloat64Subnormal( aSig, &aEx 1846 } 1847 return 1848 packFloatx80( 1849 aSign, aExp + 0x3C00, ( aSig | LI 1850 1851 } 1852 1853 #endif 1854 1855 /* 1856 --------------------------------------------- 1857 Rounds the double-precision floating-point va 1858 returns the result as a double-precision floa 1859 operation is performed according to the IEC/I 1860 Floating-point Arithmetic. 1861 --------------------------------------------- 1862 */ 1863 float64 float64_round_to_int( struct rounding 1864 { 1865 flag aSign; 1866 int16 aExp; 1867 bits64 lastBitMask, roundBitsMask; 1868 int8 roundingMode; 1869 float64 z; 1870 1871 aExp = extractFloat64Exp( a ); 1872 if ( 0x433 <= aExp ) { 1873 if ( ( aExp == 0x7FF ) && extractFloa 1874 return propagateFloat64NaN( a, a 1875 } 1876 return a; 1877 } 1878 if ( aExp <= 0x3FE ) { 1879 if ( (bits64) ( a<<1 ) == 0 ) return 1880 roundData->exception |= float_flag_in 1881 aSign = extractFloat64Sign( a ); 1882 switch ( roundData->mode ) { 1883 case float_round_nearest_even: 1884 if ( ( aExp == 0x3FE ) && extract 1885 return packFloat64( aSign, 0x 1886 } 1887 break; 1888 case float_round_down: 1889 return aSign ? LIT64( 0xBFF000000 1890 case float_round_up: 1891 return 1892 aSign ? LIT64( 0x8000000000000000 1893 } 1894 return packFloat64( aSign, 0, 0 ); 1895 } 1896 lastBitMask = 1; 1897 lastBitMask <<= 0x433 - aExp; 1898 roundBitsMask = lastBitMask - 1; 1899 z = a; 1900 roundingMode = roundData->mode; 1901 if ( roundingMode == float_round_nearest_ 1902 z += lastBitMask>>1; 1903 if ( ( z & roundBitsMask ) == 0 ) z & 1904 } 1905 else if ( roundingMode != float_round_to_ 1906 if ( extractFloat64Sign( z ) ^ ( roun 1907 z += roundBitsMask; 1908 } 1909 } 1910 z &= ~ roundBitsMask; 1911 if ( z != a ) roundData->exception |= flo 1912 return z; 1913 1914 } 1915 1916 /* 1917 --------------------------------------------- 1918 Returns the result of adding the absolute val 1919 floating-point values `a' and `b'. If `zSign 1920 before being returned. `zSign' is ignored if 1921 addition is performed according to the IEC/IE 1922 Floating-point Arithmetic. 1923 --------------------------------------------- 1924 */ 1925 static float64 addFloat64Sigs( struct roundin 1926 { 1927 int16 aExp, bExp, zExp; 1928 bits64 aSig, bSig, zSig; 1929 int16 expDiff; 1930 1931 aSig = extractFloat64Frac( a ); 1932 aExp = extractFloat64Exp( a ); 1933 bSig = extractFloat64Frac( b ); 1934 bExp = extractFloat64Exp( b ); 1935 expDiff = aExp - bExp; 1936 aSig <<= 9; 1937 bSig <<= 9; 1938 if ( 0 < expDiff ) { 1939 if ( aExp == 0x7FF ) { 1940 if ( aSig ) return propagateFloat 1941 return a; 1942 } 1943 if ( bExp == 0 ) { 1944 --expDiff; 1945 } 1946 else { 1947 bSig |= LIT64( 0x2000000000000000 1948 } 1949 shift64RightJamming( bSig, expDiff, & 1950 zExp = aExp; 1951 } 1952 else if ( expDiff < 0 ) { 1953 if ( bExp == 0x7FF ) { 1954 if ( bSig ) return propagateFloat 1955 return packFloat64( zSign, 0x7FF, 1956 } 1957 if ( aExp == 0 ) { 1958 ++expDiff; 1959 } 1960 else { 1961 aSig |= LIT64( 0x2000000000000000 1962 } 1963 shift64RightJamming( aSig, - expDiff, 1964 zExp = bExp; 1965 } 1966 else { 1967 if ( aExp == 0x7FF ) { 1968 if ( aSig | bSig ) return propaga 1969 return a; 1970 } 1971 if ( aExp == 0 ) return packFloat64( 1972 zSig = LIT64( 0x4000000000000000 ) + 1973 zExp = aExp; 1974 goto roundAndPack; 1975 } 1976 aSig |= LIT64( 0x2000000000000000 ); 1977 zSig = ( aSig + bSig )<<1; 1978 --zExp; 1979 if ( (sbits64) zSig < 0 ) { 1980 zSig = aSig + bSig; 1981 ++zExp; 1982 } 1983 roundAndPack: 1984 return roundAndPackFloat64( roundData, zS 1985 1986 } 1987 1988 /* 1989 --------------------------------------------- 1990 Returns the result of subtracting the absolut 1991 precision floating-point values `a' and `b'. 1992 difference is negated before being returned. 1993 result is a NaN. The subtraction is performe 1994 Standard for Binary Floating-point Arithmetic 1995 --------------------------------------------- 1996 */ 1997 static float64 subFloat64Sigs( struct roundin 1998 { 1999 int16 aExp, bExp, zExp; 2000 bits64 aSig, bSig, zSig; 2001 int16 expDiff; 2002 2003 aSig = extractFloat64Frac( a ); 2004 aExp = extractFloat64Exp( a ); 2005 bSig = extractFloat64Frac( b ); 2006 bExp = extractFloat64Exp( b ); 2007 expDiff = aExp - bExp; 2008 aSig <<= 10; 2009 bSig <<= 10; 2010 if ( 0 < expDiff ) goto aExpBigger; 2011 if ( expDiff < 0 ) goto bExpBigger; 2012 if ( aExp == 0x7FF ) { 2013 if ( aSig | bSig ) return propagateFl 2014 roundData->exception |= float_flag_in 2015 return float64_default_nan; 2016 } 2017 if ( aExp == 0 ) { 2018 aExp = 1; 2019 bExp = 1; 2020 } 2021 if ( bSig < aSig ) goto aBigger; 2022 if ( aSig < bSig ) goto bBigger; 2023 return packFloat64( roundData->mode == fl 2024 bExpBigger: 2025 if ( bExp == 0x7FF ) { 2026 if ( bSig ) return propagateFloat64Na 2027 return packFloat64( zSign ^ 1, 0x7FF, 2028 } 2029 if ( aExp == 0 ) { 2030 ++expDiff; 2031 } 2032 else { 2033 aSig |= LIT64( 0x4000000000000000 ); 2034 } 2035 shift64RightJamming( aSig, - expDiff, &aS 2036 bSig |= LIT64( 0x4000000000000000 ); 2037 bBigger: 2038 zSig = bSig - aSig; 2039 zExp = bExp; 2040 zSign ^= 1; 2041 goto normalizeRoundAndPack; 2042 aExpBigger: 2043 if ( aExp == 0x7FF ) { 2044 if ( aSig ) return propagateFloat64Na 2045 return a; 2046 } 2047 if ( bExp == 0 ) { 2048 --expDiff; 2049 } 2050 else { 2051 bSig |= LIT64( 0x4000000000000000 ); 2052 } 2053 shift64RightJamming( bSig, expDiff, &bSig 2054 aSig |= LIT64( 0x4000000000000000 ); 2055 aBigger: 2056 zSig = aSig - bSig; 2057 zExp = aExp; 2058 normalizeRoundAndPack: 2059 --zExp; 2060 return normalizeRoundAndPackFloat64( roun 2061 2062 } 2063 2064 /* 2065 --------------------------------------------- 2066 Returns the result of adding the double-preci 2067 and `b'. The operation is performed accordin 2068 Binary Floating-point Arithmetic. 2069 --------------------------------------------- 2070 */ 2071 float64 float64_add( struct roundingData *rou 2072 { 2073 flag aSign, bSign; 2074 2075 aSign = extractFloat64Sign( a ); 2076 bSign = extractFloat64Sign( b ); 2077 if ( aSign == bSign ) { 2078 return addFloat64Sigs( roundData, a, 2079 } 2080 else { 2081 return subFloat64Sigs( roundData, a, 2082 } 2083 2084 } 2085 2086 /* 2087 --------------------------------------------- 2088 Returns the result of subtracting the double- 2089 `a' and `b'. The operation is performed acco 2090 for Binary Floating-point Arithmetic. 2091 --------------------------------------------- 2092 */ 2093 float64 float64_sub( struct roundingData *rou 2094 { 2095 flag aSign, bSign; 2096 2097 aSign = extractFloat64Sign( a ); 2098 bSign = extractFloat64Sign( b ); 2099 if ( aSign == bSign ) { 2100 return subFloat64Sigs( roundData, a, 2101 } 2102 else { 2103 return addFloat64Sigs( roundData, a, 2104 } 2105 2106 } 2107 2108 /* 2109 --------------------------------------------- 2110 Returns the result of multiplying the double- 2111 `a' and `b'. The operation is performed acco 2112 for Binary Floating-point Arithmetic. 2113 --------------------------------------------- 2114 */ 2115 float64 float64_mul( struct roundingData *rou 2116 { 2117 flag aSign, bSign, zSign; 2118 int16 aExp, bExp, zExp; 2119 bits64 aSig, bSig, zSig0, zSig1; 2120 2121 aSig = extractFloat64Frac( a ); 2122 aExp = extractFloat64Exp( a ); 2123 aSign = extractFloat64Sign( a ); 2124 bSig = extractFloat64Frac( b ); 2125 bExp = extractFloat64Exp( b ); 2126 bSign = extractFloat64Sign( b ); 2127 zSign = aSign ^ bSign; 2128 if ( aExp == 0x7FF ) { 2129 if ( aSig || ( ( bExp == 0x7FF ) && b 2130 return propagateFloat64NaN( a, b 2131 } 2132 if ( ( bExp | bSig ) == 0 ) { 2133 roundData->exception |= float_fla 2134 return float64_default_nan; 2135 } 2136 return packFloat64( zSign, 0x7FF, 0 ) 2137 } 2138 if ( bExp == 0x7FF ) { 2139 if ( bSig ) return propagateFloat64Na 2140 if ( ( aExp | aSig ) == 0 ) { 2141 roundData->exception |= float_fla 2142 return float64_default_nan; 2143 } 2144 return packFloat64( zSign, 0x7FF, 0 ) 2145 } 2146 if ( aExp == 0 ) { 2147 if ( aSig == 0 ) return packFloat64( 2148 normalizeFloat64Subnormal( aSig, &aEx 2149 } 2150 if ( bExp == 0 ) { 2151 if ( bSig == 0 ) return packFloat64( 2152 normalizeFloat64Subnormal( bSig, &bEx 2153 } 2154 zExp = aExp + bExp - 0x3FF; 2155 aSig = ( aSig | LIT64( 0x0010000000000000 2156 bSig = ( bSig | LIT64( 0x0010000000000000 2157 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2158 zSig0 |= ( zSig1 != 0 ); 2159 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2160 zSig0 <<= 1; 2161 --zExp; 2162 } 2163 return roundAndPackFloat64( roundData, zS 2164 2165 } 2166 2167 /* 2168 --------------------------------------------- 2169 Returns the result of dividing the double-pre 2170 by the corresponding value `b'. The operatio 2171 the IEC/IEEE Standard for Binary Floating-poi 2172 --------------------------------------------- 2173 */ 2174 float64 float64_div( struct roundingData *rou 2175 { 2176 flag aSign, bSign, zSign; 2177 int16 aExp, bExp, zExp; 2178 bits64 aSig, bSig, zSig; 2179 bits64 rem0, rem1; 2180 bits64 term0, term1; 2181 2182 aSig = extractFloat64Frac( a ); 2183 aExp = extractFloat64Exp( a ); 2184 aSign = extractFloat64Sign( a ); 2185 bSig = extractFloat64Frac( b ); 2186 bExp = extractFloat64Exp( b ); 2187 bSign = extractFloat64Sign( b ); 2188 zSign = aSign ^ bSign; 2189 if ( aExp == 0x7FF ) { 2190 if ( aSig ) return propagateFloat64Na 2191 if ( bExp == 0x7FF ) { 2192 if ( bSig ) return propagateFloat 2193 roundData->exception |= float_fla 2194 return float64_default_nan; 2195 } 2196 return packFloat64( zSign, 0x7FF, 0 ) 2197 } 2198 if ( bExp == 0x7FF ) { 2199 if ( bSig ) return propagateFloat64Na 2200 return packFloat64( zSign, 0, 0 ); 2201 } 2202 if ( bExp == 0 ) { 2203 if ( bSig == 0 ) { 2204 if ( ( aExp | aSig ) == 0 ) { 2205 roundData->exception |= float 2206 return float64_default_nan; 2207 } 2208 roundData->exception |= float_fla 2209 return packFloat64( zSign, 0x7FF, 2210 } 2211 normalizeFloat64Subnormal( bSig, &bEx 2212 } 2213 if ( aExp == 0 ) { 2214 if ( aSig == 0 ) return packFloat64( 2215 normalizeFloat64Subnormal( aSig, &aEx 2216 } 2217 zExp = aExp - bExp + 0x3FD; 2218 aSig = ( aSig | LIT64( 0x0010000000000000 2219 bSig = ( bSig | LIT64( 0x0010000000000000 2220 if ( bSig <= ( aSig + aSig ) ) { 2221 aSig >>= 1; 2222 ++zExp; 2223 } 2224 zSig = estimateDiv128To64( aSig, 0, bSig 2225 if ( ( zSig & 0x1FF ) <= 2 ) { 2226 mul64To128( bSig, zSig, &term0, &term 2227 sub128( aSig, 0, term0, term1, &rem0, 2228 while ( (sbits64) rem0 < 0 ) { 2229 --zSig; 2230 add128( rem0, rem1, 0, bSig, &rem 2231 } 2232 zSig |= ( rem1 != 0 ); 2233 } 2234 return roundAndPackFloat64( roundData, zS 2235 2236 } 2237 2238 /* 2239 --------------------------------------------- 2240 Returns the remainder of the double-precision 2241 with respect to the corresponding value `b'. 2242 according to the IEC/IEEE Standard for Binary 2243 --------------------------------------------- 2244 */ 2245 float64 float64_rem( struct roundingData *rou 2246 { 2247 flag aSign, bSign, zSign; 2248 int16 aExp, bExp, expDiff; 2249 bits64 aSig, bSig; 2250 bits64 q, alternateASig; 2251 sbits64 sigMean; 2252 2253 aSig = extractFloat64Frac( a ); 2254 aExp = extractFloat64Exp( a ); 2255 aSign = extractFloat64Sign( a ); 2256 bSig = extractFloat64Frac( b ); 2257 bExp = extractFloat64Exp( b ); 2258 bSign = extractFloat64Sign( b ); 2259 if ( aExp == 0x7FF ) { 2260 if ( aSig || ( ( bExp == 0x7FF ) && b 2261 return propagateFloat64NaN( a, b 2262 } 2263 roundData->exception |= float_flag_in 2264 return float64_default_nan; 2265 } 2266 if ( bExp == 0x7FF ) { 2267 if ( bSig ) return propagateFloat64Na 2268 return a; 2269 } 2270 if ( bExp == 0 ) { 2271 if ( bSig == 0 ) { 2272 roundData->exception |= float_fla 2273 return float64_default_nan; 2274 } 2275 normalizeFloat64Subnormal( bSig, &bEx 2276 } 2277 if ( aExp == 0 ) { 2278 if ( aSig == 0 ) return a; 2279 normalizeFloat64Subnormal( aSig, &aEx 2280 } 2281 expDiff = aExp - bExp; 2282 aSig = ( aSig | LIT64( 0x0010000000000000 2283 bSig = ( bSig | LIT64( 0x0010000000000000 2284 if ( expDiff < 0 ) { 2285 if ( expDiff < -1 ) return a; 2286 aSig >>= 1; 2287 } 2288 q = ( bSig <= aSig ); 2289 if ( q ) aSig -= bSig; 2290 expDiff -= 64; 2291 while ( 0 < expDiff ) { 2292 q = estimateDiv128To64( aSig, 0, bSig 2293 q = ( 2 < q ) ? q - 2 : 0; 2294 aSig = - ( ( bSig>>2 ) * q ); 2295 expDiff -= 62; 2296 } 2297 expDiff += 64; 2298 if ( 0 < expDiff ) { 2299 q = estimateDiv128To64( aSig, 0, bSig 2300 q = ( 2 < q ) ? q - 2 : 0; 2301 q >>= 64 - expDiff; 2302 bSig >>= 2; 2303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) 2304 } 2305 else { 2306 aSig >>= 2; 2307 bSig >>= 2; 2308 } 2309 do { 2310 alternateASig = aSig; 2311 ++q; 2312 aSig -= bSig; 2313 } while ( 0 <= (sbits64) aSig ); 2314 sigMean = aSig + alternateASig; 2315 if ( ( sigMean < 0 ) || ( ( sigMean == 0 2316 aSig = alternateASig; 2317 } 2318 zSign = ( (sbits64) aSig < 0 ); 2319 if ( zSign ) aSig = - aSig; 2320 return normalizeRoundAndPackFloat64( roun 2321 2322 } 2323 2324 /* 2325 --------------------------------------------- 2326 Returns the square root of the double-precisi 2327 The operation is performed according to the I 2328 Floating-point Arithmetic. 2329 --------------------------------------------- 2330 */ 2331 float64 float64_sqrt( struct roundingData *ro 2332 { 2333 flag aSign; 2334 int16 aExp, zExp; 2335 bits64 aSig, zSig; 2336 bits64 rem0, rem1, term0, term1; //, shif 2337 //float64 z; 2338 2339 aSig = extractFloat64Frac( a ); 2340 aExp = extractFloat64Exp( a ); 2341 aSign = extractFloat64Sign( a ); 2342 if ( aExp == 0x7FF ) { 2343 if ( aSig ) return propagateFloat64Na 2344 if ( ! aSign ) return a; 2345 roundData->exception |= float_flag_in 2346 return float64_default_nan; 2347 } 2348 if ( aSign ) { 2349 if ( ( aExp | aSig ) == 0 ) return a; 2350 roundData->exception |= float_flag_in 2351 return float64_default_nan; 2352 } 2353 if ( aExp == 0 ) { 2354 if ( aSig == 0 ) return 0; 2355 normalizeFloat64Subnormal( aSig, &aEx 2356 } 2357 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2358 aSig |= LIT64( 0x0010000000000000 ); 2359 zSig = estimateSqrt32( aExp, aSig>>21 ); 2360 zSig <<= 31; 2361 aSig <<= 9 - ( aExp & 1 ); 2362 zSig = estimateDiv128To64( aSig, 0, zSig 2363 if ( ( zSig & 0x3FF ) <= 5 ) { 2364 if ( zSig < 2 ) { 2365 zSig = LIT64( 0xFFFFFFFFFFFFFFFF 2366 } 2367 else { 2368 aSig <<= 2; 2369 mul64To128( zSig, zSig, &term0, & 2370 sub128( aSig, 0, term0, term1, &r 2371 while ( (sbits64) rem0 < 0 ) { 2372 --zSig; 2373 shortShift128Left( 0, zSig, 1 2374 term1 |= 1; 2375 add128( rem0, rem1, term0, te 2376 } 2377 zSig |= ( ( rem0 | rem1 ) != 0 ); 2378 } 2379 } 2380 shift64RightJamming( zSig, 1, &zSig ); 2381 return roundAndPackFloat64( roundData, 0, 2382 2383 } 2384 2385 /* 2386 --------------------------------------------- 2387 Returns 1 if the double-precision floating-po 2388 corresponding value `b', and 0 otherwise. Th 2389 according to the IEC/IEEE Standard for Binary 2390 --------------------------------------------- 2391 */ 2392 flag float64_eq( float64 a, float64 b ) 2393 { 2394 2395 if ( ( ( extractFloat64Exp( a ) == 0x7 2396 || ( ( extractFloat64Exp( b ) == 0x7 2397 ) { 2398 if ( float64_is_signaling_nan( a ) || 2399 float_raise( float_flag_invalid ) 2400 } 2401 return 0; 2402 } 2403 return ( a == b ) || ( (bits64) ( ( a | b 2404 2405 } 2406 2407 /* 2408 --------------------------------------------- 2409 Returns 1 if the double-precision floating-po 2410 equal to the corresponding value `b', and 0 o 2411 performed according to the IEC/IEEE Standard 2412 Arithmetic. 2413 --------------------------------------------- 2414 */ 2415 flag float64_le( float64 a, float64 b ) 2416 { 2417 flag aSign, bSign; 2418 2419 if ( ( ( extractFloat64Exp( a ) == 0x7 2420 || ( ( extractFloat64Exp( b ) == 0x7 2421 ) { 2422 float_raise( float_flag_invalid ); 2423 return 0; 2424 } 2425 aSign = extractFloat64Sign( a ); 2426 bSign = extractFloat64Sign( b ); 2427 if ( aSign != bSign ) return aSign || ( ( 2428 return ( a == b ) || ( aSign ^ ( a < b ) 2429 2430 } 2431 2432 /* 2433 --------------------------------------------- 2434 Returns 1 if the double-precision floating-po 2435 the corresponding value `b', and 0 otherwise. 2436 according to the IEC/IEEE Standard for Binary 2437 --------------------------------------------- 2438 */ 2439 flag float64_lt( float64 a, float64 b ) 2440 { 2441 flag aSign, bSign; 2442 2443 if ( ( ( extractFloat64Exp( a ) == 0x7 2444 || ( ( extractFloat64Exp( b ) == 0x7 2445 ) { 2446 float_raise( float_flag_invalid ); 2447 return 0; 2448 } 2449 aSign = extractFloat64Sign( a ); 2450 bSign = extractFloat64Sign( b ); 2451 if ( aSign != bSign ) return aSign && ( ( 2452 return ( a != b ) && ( aSign ^ ( a < b ) 2453 2454 } 2455 2456 /* 2457 --------------------------------------------- 2458 Returns 1 if the double-precision floating-po 2459 corresponding value `b', and 0 otherwise. Th 2460 if either operand is a NaN. Otherwise, the c 2461 according to the IEC/IEEE Standard for Binary 2462 --------------------------------------------- 2463 */ 2464 flag float64_eq_signaling( float64 a, float64 2465 { 2466 2467 if ( ( ( extractFloat64Exp( a ) == 0x7 2468 || ( ( extractFloat64Exp( b ) == 0x7 2469 ) { 2470 float_raise( float_flag_invalid ); 2471 return 0; 2472 } 2473 return ( a == b ) || ( (bits64) ( ( a | b 2474 2475 } 2476 2477 /* 2478 --------------------------------------------- 2479 Returns 1 if the double-precision floating-po 2480 equal to the corresponding value `b', and 0 o 2481 cause an exception. Otherwise, the compariso 2482 IEC/IEEE Standard for Binary Floating-point A 2483 --------------------------------------------- 2484 */ 2485 flag float64_le_quiet( float64 a, float64 b ) 2486 { 2487 flag aSign, bSign; 2488 //int16 aExp, bExp; 2489 2490 if ( ( ( extractFloat64Exp( a ) == 0x7 2491 || ( ( extractFloat64Exp( b ) == 0x7 2492 ) { 2493 /* Do nothing, even if NaN as we're q 2494 return 0; 2495 } 2496 aSign = extractFloat64Sign( a ); 2497 bSign = extractFloat64Sign( b ); 2498 if ( aSign != bSign ) return aSign || ( ( 2499 return ( a == b ) || ( aSign ^ ( a < b ) 2500 2501 } 2502 2503 /* 2504 --------------------------------------------- 2505 Returns 1 if the double-precision floating-po 2506 the corresponding value `b', and 0 otherwise. 2507 exception. Otherwise, the comparison is perf 2508 Standard for Binary Floating-point Arithmetic 2509 --------------------------------------------- 2510 */ 2511 flag float64_lt_quiet( float64 a, float64 b ) 2512 { 2513 flag aSign, bSign; 2514 2515 if ( ( ( extractFloat64Exp( a ) == 0x7 2516 || ( ( extractFloat64Exp( b ) == 0x7 2517 ) { 2518 /* Do nothing, even if NaN as we're q 2519 return 0; 2520 } 2521 aSign = extractFloat64Sign( a ); 2522 bSign = extractFloat64Sign( b ); 2523 if ( aSign != bSign ) return aSign && ( ( 2524 return ( a != b ) && ( aSign ^ ( a < b ) 2525 2526 } 2527 2528 #ifdef FLOATX80 2529 2530 /* 2531 --------------------------------------------- 2532 Returns the result of converting the extended 2533 point value `a' to the 32-bit two's complemen 2534 conversion is performed according to the IEC/ 2535 Floating-point Arithmetic---which means in pa 2536 is rounded according to the current rounding 2537 largest positive integer is returned. Otherw 2538 overflows, the largest integer with the same 2539 --------------------------------------------- 2540 */ 2541 int32 floatx80_to_int32( struct roundingData 2542 { 2543 flag aSign; 2544 int32 aExp, shiftCount; 2545 bits64 aSig; 2546 2547 aSig = extractFloatx80Frac( a ); 2548 aExp = extractFloatx80Exp( a ); 2549 aSign = extractFloatx80Sign( a ); 2550 if ( ( aExp == 0x7FFF ) && (bits64) ( aSi 2551 shiftCount = 0x4037 - aExp; 2552 if ( shiftCount <= 0 ) shiftCount = 1; 2553 shift64RightJamming( aSig, shiftCount, &a 2554 return roundAndPackInt32( roundData, aSig 2555 2556 } 2557 2558 /* 2559 --------------------------------------------- 2560 Returns the result of converting the extended 2561 point value `a' to the 32-bit two's complemen 2562 conversion is performed according to the IEC/ 2563 Floating-point Arithmetic, except that the co 2564 toward zero. If `a' is a NaN, the largest po 2565 Otherwise, if the conversion overflows, the l 2566 sign as `a' is returned. 2567 --------------------------------------------- 2568 */ 2569 int32 floatx80_to_int32_round_to_zero( floatx 2570 { 2571 flag aSign; 2572 int32 aExp, shiftCount; 2573 bits64 aSig, savedASig; 2574 int32 z; 2575 2576 aSig = extractFloatx80Frac( a ); 2577 aExp = extractFloatx80Exp( a ); 2578 aSign = extractFloatx80Sign( a ); 2579 shiftCount = 0x403E - aExp; 2580 if ( shiftCount < 32 ) { 2581 if ( ( aExp == 0x7FFF ) && (bits64) ( 2582 goto invalid; 2583 } 2584 else if ( 63 < shiftCount ) { 2585 if ( aExp || aSig ) float_raise( floa 2586 return 0; 2587 } 2588 savedASig = aSig; 2589 aSig >>= shiftCount; 2590 z = aSig; 2591 if ( aSign ) z = - z; 2592 if ( ( z < 0 ) ^ aSign ) { 2593 invalid: 2594 float_raise( float_flag_invalid ); 2595 return aSign ? 0x80000000 : 0x7FFFFFF 2596 } 2597 if ( ( aSig<<shiftCount ) != savedASig ) 2598 float_raise( float_flag_inexact ); 2599 } 2600 return z; 2601 2602 } 2603 2604 /* 2605 --------------------------------------------- 2606 Returns the result of converting the extended 2607 point value `a' to the single-precision float 2608 conversion is performed according to the IEC/ 2609 Floating-point Arithmetic. 2610 --------------------------------------------- 2611 */ 2612 float32 floatx80_to_float32( struct roundingD 2613 { 2614 flag aSign; 2615 int32 aExp; 2616 bits64 aSig; 2617 2618 aSig = extractFloatx80Frac( a ); 2619 aExp = extractFloatx80Exp( a ); 2620 aSign = extractFloatx80Sign( a ); 2621 if ( aExp == 0x7FFF ) { 2622 if ( (bits64) ( aSig<<1 ) ) { 2623 return commonNaNToFloat32( floatx 2624 } 2625 return packFloat32( aSign, 0xFF, 0 ); 2626 } 2627 shift64RightJamming( aSig, 33, &aSig ); 2628 if ( aExp || aSig ) aExp -= 0x3F81; 2629 return roundAndPackFloat32( roundData, aS 2630 2631 } 2632 2633 /* 2634 --------------------------------------------- 2635 Returns the result of converting the extended 2636 point value `a' to the double-precision float 2637 conversion is performed according to the IEC/ 2638 Floating-point Arithmetic. 2639 --------------------------------------------- 2640 */ 2641 float64 floatx80_to_float64( struct roundingD 2642 { 2643 flag aSign; 2644 int32 aExp; 2645 bits64 aSig, zSig; 2646 2647 aSig = extractFloatx80Frac( a ); 2648 aExp = extractFloatx80Exp( a ); 2649 aSign = extractFloatx80Sign( a ); 2650 if ( aExp == 0x7FFF ) { 2651 if ( (bits64) ( aSig<<1 ) ) { 2652 return commonNaNToFloat64( floatx 2653 } 2654 return packFloat64( aSign, 0x7FF, 0 ) 2655 } 2656 shift64RightJamming( aSig, 1, &zSig ); 2657 if ( aExp || aSig ) aExp -= 0x3C01; 2658 return roundAndPackFloat64( roundData, aS 2659 2660 } 2661 2662 /* 2663 --------------------------------------------- 2664 Rounds the extended double-precision floating 2665 and returns the result as an extended quadrup 2666 value. The operation is performed according 2667 Binary Floating-point Arithmetic. 2668 --------------------------------------------- 2669 */ 2670 floatx80 floatx80_round_to_int( struct roundi 2671 { 2672 flag aSign; 2673 int32 aExp; 2674 bits64 lastBitMask, roundBitsMask; 2675 int8 roundingMode; 2676 floatx80 z; 2677 2678 aExp = extractFloatx80Exp( a ); 2679 if ( 0x403E <= aExp ) { 2680 if ( ( aExp == 0x7FFF ) && (bits64) ( 2681 return propagateFloatx80NaN( a, a 2682 } 2683 return a; 2684 } 2685 if ( aExp <= 0x3FFE ) { 2686 if ( ( aExp == 0 ) 2687 && ( (bits64) ( extractFloatx80F 2688 return a; 2689 } 2690 roundData->exception |= float_flag_in 2691 aSign = extractFloatx80Sign( a ); 2692 switch ( roundData->mode ) { 2693 case float_round_nearest_even: 2694 if ( ( aExp == 0x3FFE ) && (bits6 2695 ) { 2696 return 2697 packFloatx80( aSign, 0x3F 2698 } 2699 break; 2700 case float_round_down: 2701 return 2702 aSign ? 2703 packFloatx80( 1, 0x3FFF 2704 : packFloatx80( 0, 0, 0 ); 2705 case float_round_up: 2706 return 2707 aSign ? packFloatx80( 1, 0, 2708 : packFloatx80( 0, 0x3FFF, LI 2709 } 2710 return packFloatx80( aSign, 0, 0 ); 2711 } 2712 lastBitMask = 1; 2713 lastBitMask <<= 0x403E - aExp; 2714 roundBitsMask = lastBitMask - 1; 2715 z = a; 2716 roundingMode = roundData->mode; 2717 if ( roundingMode == float_round_nearest_ 2718 z.low += lastBitMask>>1; 2719 if ( ( z.low & roundBitsMask ) == 0 ) 2720 } 2721 else if ( roundingMode != float_round_to_ 2722 if ( extractFloatx80Sign( z ) ^ ( rou 2723 z.low += roundBitsMask; 2724 } 2725 } 2726 z.low &= ~ roundBitsMask; 2727 if ( z.low == 0 ) { 2728 ++z.high; 2729 z.low = LIT64( 0x8000000000000000 ); 2730 } 2731 if ( z.low != a.low ) roundData->exceptio 2732 return z; 2733 2734 } 2735 2736 /* 2737 --------------------------------------------- 2738 Returns the result of adding the absolute val 2739 precision floating-point values `a' and `b'. 2740 negated before being returned. `zSign' is ig 2741 The addition is performed according to the IE 2742 Floating-point Arithmetic. 2743 --------------------------------------------- 2744 */ 2745 static floatx80 addFloatx80Sigs( struct round 2746 { 2747 int32 aExp, bExp, zExp; 2748 bits64 aSig, bSig, zSig0, zSig1; 2749 int32 expDiff; 2750 2751 aSig = extractFloatx80Frac( a ); 2752 aExp = extractFloatx80Exp( a ); 2753 bSig = extractFloatx80Frac( b ); 2754 bExp = extractFloatx80Exp( b ); 2755 expDiff = aExp - bExp; 2756 if ( 0 < expDiff ) { 2757 if ( aExp == 0x7FFF ) { 2758 if ( (bits64) ( aSig<<1 ) ) retur 2759 return a; 2760 } 2761 if ( bExp == 0 ) --expDiff; 2762 shift64ExtraRightJamming( bSig, 0, ex 2763 zExp = aExp; 2764 } 2765 else if ( expDiff < 0 ) { 2766 if ( bExp == 0x7FFF ) { 2767 if ( (bits64) ( bSig<<1 ) ) retur 2768 return packFloatx80( zSign, 0x7FF 2769 } 2770 if ( aExp == 0 ) ++expDiff; 2771 shift64ExtraRightJamming( aSig, 0, - 2772 zExp = bExp; 2773 } 2774 else { 2775 if ( aExp == 0x7FFF ) { 2776 if ( (bits64) ( ( aSig | bSig )<< 2777 return propagateFloatx80NaN( 2778 } 2779 return a; 2780 } 2781 zSig1 = 0; 2782 zSig0 = aSig + bSig; 2783 if ( aExp == 0 ) { 2784 normalizeFloatx80Subnormal( zSig0 2785 goto roundAndPack; 2786 } 2787 zExp = aExp; 2788 goto shiftRight1; 2789 } 2790 2791 zSig0 = aSig + bSig; 2792 2793 if ( (sbits64) zSig0 < 0 ) goto roundAndP 2794 shiftRight1: 2795 shift64ExtraRightJamming( zSig0, zSig1, 1 2796 zSig0 |= LIT64( 0x8000000000000000 ); 2797 ++zExp; 2798 roundAndPack: 2799 return 2800 roundAndPackFloatx80( 2801 roundData, zSign, zExp, zSig0, zS 2802 2803 } 2804 2805 /* 2806 --------------------------------------------- 2807 Returns the result of subtracting the absolut 2808 double-precision floating-point values `a' an 2809 the difference is negated before being return 2810 result is a NaN. The subtraction is performe 2811 Standard for Binary Floating-point Arithmetic 2812 --------------------------------------------- 2813 */ 2814 static floatx80 subFloatx80Sigs( struct round 2815 { 2816 int32 aExp, bExp, zExp; 2817 bits64 aSig, bSig, zSig0, zSig1; 2818 int32 expDiff; 2819 floatx80 z; 2820 2821 aSig = extractFloatx80Frac( a ); 2822 aExp = extractFloatx80Exp( a ); 2823 bSig = extractFloatx80Frac( b ); 2824 bExp = extractFloatx80Exp( b ); 2825 expDiff = aExp - bExp; 2826 if ( 0 < expDiff ) goto aExpBigger; 2827 if ( expDiff < 0 ) goto bExpBigger; 2828 if ( aExp == 0x7FFF ) { 2829 if ( (bits64) ( ( aSig | bSig )<<1 ) 2830 return propagateFloatx80NaN( a, b 2831 } 2832 roundData->exception |= float_flag_in 2833 z.low = floatx80_default_nan_low; 2834 z.high = floatx80_default_nan_high; 2835 z.__padding = 0; 2836 return z; 2837 } 2838 if ( aExp == 0 ) { 2839 aExp = 1; 2840 bExp = 1; 2841 } 2842 zSig1 = 0; 2843 if ( bSig < aSig ) goto aBigger; 2844 if ( aSig < bSig ) goto bBigger; 2845 return packFloatx80( roundData->mode == f 2846 bExpBigger: 2847 if ( bExp == 0x7FFF ) { 2848 if ( (bits64) ( bSig<<1 ) ) return pr 2849 return packFloatx80( zSign ^ 1, 0x7FF 2850 } 2851 if ( aExp == 0 ) ++expDiff; 2852 shift128RightJamming( aSig, 0, - expDiff, 2853 bBigger: 2854 sub128( bSig, 0, aSig, zSig1, &zSig0, &zS 2855 zExp = bExp; 2856 zSign ^= 1; 2857 goto normalizeRoundAndPack; 2858 aExpBigger: 2859 if ( aExp == 0x7FFF ) { 2860 if ( (bits64) ( aSig<<1 ) ) return pr 2861 return a; 2862 } 2863 if ( bExp == 0 ) --expDiff; 2864 shift128RightJamming( bSig, 0, expDiff, & 2865 aBigger: 2866 sub128( aSig, 0, bSig, zSig1, &zSig0, &zS 2867 zExp = aExp; 2868 normalizeRoundAndPack: 2869 return 2870 normalizeRoundAndPackFloatx80( 2871 roundData, zSign, zExp, zSig0, zS 2872 2873 } 2874 2875 /* 2876 --------------------------------------------- 2877 Returns the result of adding the extended dou 2878 values `a' and `b'. The operation is perform 2879 Standard for Binary Floating-point Arithmetic 2880 --------------------------------------------- 2881 */ 2882 floatx80 floatx80_add( struct roundingData *r 2883 { 2884 flag aSign, bSign; 2885 2886 aSign = extractFloatx80Sign( a ); 2887 bSign = extractFloatx80Sign( b ); 2888 if ( aSign == bSign ) { 2889 return addFloatx80Sigs( roundData, a, 2890 } 2891 else { 2892 return subFloatx80Sigs( roundData, a, 2893 } 2894 2895 } 2896 2897 /* 2898 --------------------------------------------- 2899 Returns the result of subtracting the extende 2900 point values `a' and `b'. The operation is p 2901 IEC/IEEE Standard for Binary Floating-point A 2902 --------------------------------------------- 2903 */ 2904 floatx80 floatx80_sub( struct roundingData *r 2905 { 2906 flag aSign, bSign; 2907 2908 aSign = extractFloatx80Sign( a ); 2909 bSign = extractFloatx80Sign( b ); 2910 if ( aSign == bSign ) { 2911 return subFloatx80Sigs( roundData, a, 2912 } 2913 else { 2914 return addFloatx80Sigs( roundData, a, 2915 } 2916 2917 } 2918 2919 /* 2920 --------------------------------------------- 2921 Returns the result of multiplying the extende 2922 point values `a' and `b'. The operation is p 2923 IEC/IEEE Standard for Binary Floating-point A 2924 --------------------------------------------- 2925 */ 2926 floatx80 floatx80_mul( struct roundingData *r 2927 { 2928 flag aSign, bSign, zSign; 2929 int32 aExp, bExp, zExp; 2930 bits64 aSig, bSig, zSig0, zSig1; 2931 floatx80 z; 2932 2933 aSig = extractFloatx80Frac( a ); 2934 aExp = extractFloatx80Exp( a ); 2935 aSign = extractFloatx80Sign( a ); 2936 bSig = extractFloatx80Frac( b ); 2937 bExp = extractFloatx80Exp( b ); 2938 bSign = extractFloatx80Sign( b ); 2939 zSign = aSign ^ bSign; 2940 if ( aExp == 0x7FFF ) { 2941 if ( (bits64) ( aSig<<1 ) 2942 || ( ( bExp == 0x7FFF ) && (bits 2943 return propagateFloatx80NaN( a, b 2944 } 2945 if ( ( bExp | bSig ) == 0 ) goto inva 2946 return packFloatx80( zSign, 0x7FFF, L 2947 } 2948 if ( bExp == 0x7FFF ) { 2949 if ( (bits64) ( bSig<<1 ) ) return pr 2950 if ( ( aExp | aSig ) == 0 ) { 2951 invalid: 2952 roundData->exception |= float_fla 2953 z.low = floatx80_default_nan_low; 2954 z.high = floatx80_default_nan_hig 2955 z.__padding = 0; 2956 return z; 2957 } 2958 return packFloatx80( zSign, 0x7FFF, L 2959 } 2960 if ( aExp == 0 ) { 2961 if ( aSig == 0 ) return packFloatx80( 2962 normalizeFloatx80Subnormal( aSig, &aE 2963 } 2964 if ( bExp == 0 ) { 2965 if ( bSig == 0 ) return packFloatx80( 2966 normalizeFloatx80Subnormal( bSig, &bE 2967 } 2968 zExp = aExp + bExp - 0x3FFE; 2969 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2970 if ( 0 < (sbits64) zSig0 ) { 2971 shortShift128Left( zSig0, zSig1, 1, & 2972 --zExp; 2973 } 2974 return 2975 roundAndPackFloatx80( 2976 roundData, zSign, zExp, zSig0, zS 2977 2978 } 2979 2980 /* 2981 --------------------------------------------- 2982 Returns the result of dividing the extended d 2983 value `a' by the corresponding value `b'. Th 2984 according to the IEC/IEEE Standard for Binary 2985 --------------------------------------------- 2986 */ 2987 floatx80 floatx80_div( struct roundingData *r 2988 { 2989 flag aSign, bSign, zSign; 2990 int32 aExp, bExp, zExp; 2991 bits64 aSig, bSig, zSig0, zSig1; 2992 bits64 rem0, rem1, rem2, term0, term1, te 2993 floatx80 z; 2994 2995 aSig = extractFloatx80Frac( a ); 2996 aExp = extractFloatx80Exp( a ); 2997 aSign = extractFloatx80Sign( a ); 2998 bSig = extractFloatx80Frac( b ); 2999 bExp = extractFloatx80Exp( b ); 3000 bSign = extractFloatx80Sign( b ); 3001 zSign = aSign ^ bSign; 3002 if ( aExp == 0x7FFF ) { 3003 if ( (bits64) ( aSig<<1 ) ) return pr 3004 if ( bExp == 0x7FFF ) { 3005 if ( (bits64) ( bSig<<1 ) ) retur 3006 goto invalid; 3007 } 3008 return packFloatx80( zSign, 0x7FFF, L 3009 } 3010 if ( bExp == 0x7FFF ) { 3011 if ( (bits64) ( bSig<<1 ) ) return pr 3012 return packFloatx80( zSign, 0, 0 ); 3013 } 3014 if ( bExp == 0 ) { 3015 if ( bSig == 0 ) { 3016 if ( ( aExp | aSig ) == 0 ) { 3017 invalid: 3018 roundData->exception |= float 3019 z.low = floatx80_default_nan_ 3020 z.high = floatx80_default_nan 3021 z.__padding = 0; 3022 return z; 3023 } 3024 roundData->exception |= float_fla 3025 return packFloatx80( zSign, 0x7FF 3026 } 3027 normalizeFloatx80Subnormal( bSig, &bE 3028 } 3029 if ( aExp == 0 ) { 3030 if ( aSig == 0 ) return packFloatx80( 3031 normalizeFloatx80Subnormal( aSig, &aE 3032 } 3033 zExp = aExp - bExp + 0x3FFE; 3034 rem1 = 0; 3035 if ( bSig <= aSig ) { 3036 shift128Right( aSig, 0, 1, &aSig, &re 3037 ++zExp; 3038 } 3039 zSig0 = estimateDiv128To64( aSig, rem1, b 3040 mul64To128( bSig, zSig0, &term0, &term1 ) 3041 sub128( aSig, rem1, term0, term1, &rem0, 3042 while ( (sbits64) rem0 < 0 ) { 3043 --zSig0; 3044 add128( rem0, rem1, 0, bSig, &rem0, & 3045 } 3046 zSig1 = estimateDiv128To64( rem1, 0, bSig 3047 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3048 mul64To128( bSig, zSig1, &term1, &ter 3049 sub128( rem1, 0, term1, term2, &rem1, 3050 while ( (sbits64) rem1 < 0 ) { 3051 --zSig1; 3052 add128( rem1, rem2, 0, bSig, &rem 3053 } 3054 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3055 } 3056 return 3057 roundAndPackFloatx80( 3058 roundData, zSign, zExp, zSig0, zS 3059 3060 } 3061 3062 /* 3063 --------------------------------------------- 3064 Returns the remainder of the extended double- 3065 `a' with respect to the corresponding value ` 3066 according to the IEC/IEEE Standard for Binary 3067 --------------------------------------------- 3068 */ 3069 floatx80 floatx80_rem( struct roundingData *r 3070 { 3071 flag aSign, bSign, zSign; 3072 int32 aExp, bExp, expDiff; 3073 bits64 aSig0, aSig1, bSig; 3074 bits64 q, term0, term1, alternateASig0, a 3075 floatx80 z; 3076 3077 aSig0 = extractFloatx80Frac( a ); 3078 aExp = extractFloatx80Exp( a ); 3079 aSign = extractFloatx80Sign( a ); 3080 bSig = extractFloatx80Frac( b ); 3081 bExp = extractFloatx80Exp( b ); 3082 bSign = extractFloatx80Sign( b ); 3083 if ( aExp == 0x7FFF ) { 3084 if ( (bits64) ( aSig0<<1 ) 3085 || ( ( bExp == 0x7FFF ) && (bits 3086 return propagateFloatx80NaN( a, b 3087 } 3088 goto invalid; 3089 } 3090 if ( bExp == 0x7FFF ) { 3091 if ( (bits64) ( bSig<<1 ) ) return pr 3092 return a; 3093 } 3094 if ( bExp == 0 ) { 3095 if ( bSig == 0 ) { 3096 invalid: 3097 roundData->exception |= float_fla 3098 z.low = floatx80_default_nan_low; 3099 z.high = floatx80_default_nan_hig 3100 z.__padding = 0; 3101 return z; 3102 } 3103 normalizeFloatx80Subnormal( bSig, &bE 3104 } 3105 if ( aExp == 0 ) { 3106 if ( (bits64) ( aSig0<<1 ) == 0 ) ret 3107 normalizeFloatx80Subnormal( aSig0, &a 3108 } 3109 bSig |= LIT64( 0x8000000000000000 ); 3110 zSign = aSign; 3111 expDiff = aExp - bExp; 3112 aSig1 = 0; 3113 if ( expDiff < 0 ) { 3114 if ( expDiff < -1 ) return a; 3115 shift128Right( aSig0, 0, 1, &aSig0, & 3116 expDiff = 0; 3117 } 3118 q = ( bSig <= aSig0 ); 3119 if ( q ) aSig0 -= bSig; 3120 expDiff -= 64; 3121 while ( 0 < expDiff ) { 3122 q = estimateDiv128To64( aSig0, aSig1, 3123 q = ( 2 < q ) ? q - 2 : 0; 3124 mul64To128( bSig, q, &term0, &term1 ) 3125 sub128( aSig0, aSig1, term0, term1, & 3126 shortShift128Left( aSig0, aSig1, 62, 3127 expDiff -= 62; 3128 } 3129 expDiff += 64; 3130 if ( 0 < expDiff ) { 3131 q = estimateDiv128To64( aSig0, aSig1, 3132 q = ( 2 < q ) ? q - 2 : 0; 3133 q >>= 64 - expDiff; 3134 mul64To128( bSig, q<<( 64 - expDiff ) 3135 sub128( aSig0, aSig1, term0, term1, & 3136 shortShift128Left( 0, bSig, 64 - expD 3137 while ( le128( term0, term1, aSig0, a 3138 ++q; 3139 sub128( aSig0, aSig1, term0, term 3140 } 3141 } 3142 else { 3143 term1 = 0; 3144 term0 = bSig; 3145 } 3146 sub128( term0, term1, aSig0, aSig1, &alte 3147 if ( lt128( alternateASig0, alternateA 3148 || ( eq128( alternateASig0, alter 3149 && ( q & 1 ) ) 3150 ) { 3151 aSig0 = alternateASig0; 3152 aSig1 = alternateASig1; 3153 zSign = ! zSign; 3154 } 3155 3156 return 3157 normalizeRoundAndPackFloatx80( 3158 roundData, zSign, bExp + expDiff, 3159 3160 } 3161 3162 /* 3163 --------------------------------------------- 3164 Returns the square root of the extended doubl 3165 value `a'. The operation is performed accord 3166 for Binary Floating-point Arithmetic. 3167 --------------------------------------------- 3168 */ 3169 floatx80 floatx80_sqrt( struct roundingData * 3170 { 3171 flag aSign; 3172 int32 aExp, zExp; 3173 bits64 aSig0, aSig1, zSig0, zSig1; 3174 bits64 rem0, rem1, rem2, rem3, term0, ter 3175 bits64 shiftedRem0, shiftedRem1; 3176 floatx80 z; 3177 3178 aSig0 = extractFloatx80Frac( a ); 3179 aExp = extractFloatx80Exp( a ); 3180 aSign = extractFloatx80Sign( a ); 3181 if ( aExp == 0x7FFF ) { 3182 if ( (bits64) ( aSig0<<1 ) ) return p 3183 if ( ! aSign ) return a; 3184 goto invalid; 3185 } 3186 if ( aSign ) { 3187 if ( ( aExp | aSig0 ) == 0 ) return a 3188 invalid: 3189 roundData->exception |= float_flag_in 3190 z.low = floatx80_default_nan_low; 3191 z.high = floatx80_default_nan_high; 3192 z.__padding = 0; 3193 return z; 3194 } 3195 if ( aExp == 0 ) { 3196 if ( aSig0 == 0 ) return packFloatx80 3197 normalizeFloatx80Subnormal( aSig0, &a 3198 } 3199 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3200 zSig0 = estimateSqrt32( aExp, aSig0>>32 ) 3201 zSig0 <<= 31; 3202 aSig1 = 0; 3203 shift128Right( aSig0, 0, ( aExp & 1 ) + 2 3204 zSig0 = estimateDiv128To64( aSig0, aSig1, 3205 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64 3206 shortShift128Left( aSig0, aSig1, 2, &aSig 3207 mul64To128( zSig0, zSig0, &term0, &term1 3208 sub128( aSig0, aSig1, term0, term1, &rem0 3209 while ( (sbits64) rem0 < 0 ) { 3210 --zSig0; 3211 shortShift128Left( 0, zSig0, 1, &term 3212 term1 |= 1; 3213 add128( rem0, rem1, term0, term1, &re 3214 } 3215 shortShift128Left( rem0, rem1, 63, &shift 3216 zSig1 = estimateDiv128To64( shiftedRem0, 3217 if ( (bits64) ( zSig1<<1 ) <= 10 ) { 3218 if ( zSig1 == 0 ) zSig1 = 1; 3219 mul64To128( zSig0, zSig1, &term1, &te 3220 shortShift128Left( term1, term2, 1, & 3221 sub128( rem1, 0, term1, term2, &rem1, 3222 mul64To128( zSig1, zSig1, &term2, &te 3223 sub192( rem1, rem2, 0, 0, term2, term 3224 while ( (sbits64) rem1 < 0 ) { 3225 --zSig1; 3226 shortShift192Left( 0, zSig0, zSig 3227 term3 |= 1; 3228 add192( 3229 rem1, rem2, rem3, term1, term 3230 } 3231 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 3232 } 3233 return 3234 roundAndPackFloatx80( 3235 roundData, 0, zExp, zSig0, zSig1 3236 3237 } 3238 3239 /* 3240 --------------------------------------------- 3241 Returns 1 if the extended double-precision fl 3242 equal to the corresponding value `b', and 0 o 3243 performed according to the IEC/IEEE Standard 3244 Arithmetic. 3245 --------------------------------------------- 3246 */ 3247 flag floatx80_eq( floatx80 a, floatx80 b ) 3248 { 3249 3250 if ( ( ( extractFloatx80Exp( a ) == 3251 && (bits64) ( extractFloatx80Fr 3252 || ( ( extractFloatx80Exp( b ) == 3253 && (bits64) ( extractFloatx80Fr 3254 ) { 3255 if ( floatx80_is_signaling_nan( a 3256 || floatx80_is_signaling_nan( b 3257 float_raise( float_flag_invalid ) 3258 } 3259 return 0; 3260 } 3261 return 3262 ( a.low == b.low ) 3263 && ( ( a.high == b.high ) 3264 || ( ( a.low == 0 ) 3265 && ( (bits16) ( ( a.high | 3266 ); 3267 3268 } 3269 3270 /* 3271 --------------------------------------------- 3272 Returns 1 if the extended double-precision fl 3273 less than or equal to the corresponding value 3274 comparison is performed according to the IEC/ 3275 Floating-point Arithmetic. 3276 --------------------------------------------- 3277 */ 3278 flag floatx80_le( floatx80 a, floatx80 b ) 3279 { 3280 flag aSign, bSign; 3281 3282 if ( ( ( extractFloatx80Exp( a ) == 3283 && (bits64) ( extractFloatx80Fr 3284 || ( ( extractFloatx80Exp( b ) == 3285 && (bits64) ( extractFloatx80Fr 3286 ) { 3287 float_raise( float_flag_invalid ); 3288 return 0; 3289 } 3290 aSign = extractFloatx80Sign( a ); 3291 bSign = extractFloatx80Sign( b ); 3292 if ( aSign != bSign ) { 3293 return 3294 aSign 3295 || ( ( ( (bits16) ( ( a.high | 3296 == 0 ); 3297 } 3298 return 3299 aSign ? le128( b.high, b.low, a.hig 3300 : le128( a.high, a.low, b.high, b.low 3301 3302 } 3303 3304 /* 3305 --------------------------------------------- 3306 Returns 1 if the extended double-precision fl 3307 less than the corresponding value `b', and 0 3308 is performed according to the IEC/IEEE Standa 3309 Arithmetic. 3310 --------------------------------------------- 3311 */ 3312 flag floatx80_lt( floatx80 a, floatx80 b ) 3313 { 3314 flag aSign, bSign; 3315 3316 if ( ( ( extractFloatx80Exp( a ) == 3317 && (bits64) ( extractFloatx80Fr 3318 || ( ( extractFloatx80Exp( b ) == 3319 && (bits64) ( extractFloatx80Fr 3320 ) { 3321 float_raise( float_flag_invalid ); 3322 return 0; 3323 } 3324 aSign = extractFloatx80Sign( a ); 3325 bSign = extractFloatx80Sign( b ); 3326 if ( aSign != bSign ) { 3327 return 3328 aSign 3329 && ( ( ( (bits16) ( ( a.high | 3330 != 0 ); 3331 } 3332 return 3333 aSign ? lt128( b.high, b.low, a.hig 3334 : lt128( a.high, a.low, b.high, b.low 3335 3336 } 3337 3338 /* 3339 --------------------------------------------- 3340 Returns 1 if the extended double-precision fl 3341 to the corresponding value `b', and 0 otherwi 3342 raised if either operand is a NaN. Otherwise 3343 according to the IEC/IEEE Standard for Binary 3344 --------------------------------------------- 3345 */ 3346 flag floatx80_eq_signaling( floatx80 a, float 3347 { 3348 3349 if ( ( ( extractFloatx80Exp( a ) == 3350 && (bits64) ( extractFloatx80Fr 3351 || ( ( extractFloatx80Exp( b ) == 3352 && (bits64) ( extractFloatx80Fr 3353 ) { 3354 float_raise( float_flag_invalid ); 3355 return 0; 3356 } 3357 return 3358 ( a.low == b.low ) 3359 && ( ( a.high == b.high ) 3360 || ( ( a.low == 0 ) 3361 && ( (bits16) ( ( a.high | 3362 ); 3363 3364 } 3365 3366 /* 3367 --------------------------------------------- 3368 Returns 1 if the extended double-precision fl 3369 than or equal to the corresponding value `b', 3370 do not cause an exception. Otherwise, the co 3371 to the IEC/IEEE Standard for Binary Floating- 3372 --------------------------------------------- 3373 */ 3374 flag floatx80_le_quiet( floatx80 a, floatx80 3375 { 3376 flag aSign, bSign; 3377 3378 if ( ( ( extractFloatx80Exp( a ) == 3379 && (bits64) ( extractFloatx80Fr 3380 || ( ( extractFloatx80Exp( b ) == 3381 && (bits64) ( extractFloatx80Fr 3382 ) { 3383 /* Do nothing, even if NaN as we're q 3384 return 0; 3385 } 3386 aSign = extractFloatx80Sign( a ); 3387 bSign = extractFloatx80Sign( b ); 3388 if ( aSign != bSign ) { 3389 return 3390 aSign 3391 || ( ( ( (bits16) ( ( a.high | 3392 == 0 ); 3393 } 3394 return 3395 aSign ? le128( b.high, b.low, a.hig 3396 : le128( a.high, a.low, b.high, b.low 3397 3398 } 3399 3400 /* 3401 --------------------------------------------- 3402 Returns 1 if the extended double-precision fl 3403 than the corresponding value `b', and 0 other 3404 an exception. Otherwise, the comparison is p 3405 IEC/IEEE Standard for Binary Floating-point A 3406 --------------------------------------------- 3407 */ 3408 flag floatx80_lt_quiet( floatx80 a, floatx80 3409 { 3410 flag aSign, bSign; 3411 3412 if ( ( ( extractFloatx80Exp( a ) == 3413 && (bits64) ( extractFloatx80Fr 3414 || ( ( extractFloatx80Exp( b ) == 3415 && (bits64) ( extractFloatx80Fr 3416 ) { 3417 /* Do nothing, even if NaN as we're q 3418 return 0; 3419 } 3420 aSign = extractFloatx80Sign( a ); 3421 bSign = extractFloatx80Sign( b ); 3422 if ( aSign != bSign ) { 3423 return 3424 aSign 3425 && ( ( ( (bits16) ( ( a.high | 3426 != 0 ); 3427 } 3428 return 3429 aSign ? lt128( b.high, b.low, a.hig 3430 : lt128( a.high, a.low, b.high, b.low 3431 3432 } 3433 3434 #endif 3435 3436
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.