CoCalc -- r128.h

GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/misc/r128.h
⁹⁹⁰² views
1
/*
2
r128.h: 128-bit (64.64) signed fixed-point arithmetic. Version 1.6.0
3

4
COMPILATION
5
-----------
6
Drop this header file somewhere in your project and include it wherever it is
7
needed. There is no separate .c file for this library. To get the code, in ONE
8
file in your project, put:
9

10
#define R128_IMPLEMENTATION
11

12
before you include this file. You may also provide a definition for R128_ASSERT
13
to force the library to use a custom assert macro.
14

15
COMPILER/LIBRARY SUPPORT
16
------------------------
17
This library requires a C89 compiler with support for 64-bit integers. If your
18
compiler does not support the long long data type, the R128_U64, etc. macros
19
must be set appropriately. On x86 and x64 targets, Intel intrinsics are used
20
for speed. If your compiler does not support these intrinsics, you can add
21
#define R128_STDC_ONLY
22
in your implementation file before including r128.h.
23

24
The only C runtime library functionality used by this library is <assert.h>.
25
This can be avoided by defining an R128_ASSERT macro in your implementation
26
file. Since this library uses 64-bit arithmetic, this may implicitly add a
27
runtime library dependency on 32-bit platforms.
28

29
C++ SUPPORT
30
-----------
31
Operator overloads are supplied for C++ files that include this file. Since all
32
C++ functions are declared inline (or static inline), the R128_IMPLEMENTATION
33
file can be either C++ or C.
34

35
LICENSE
36
-------
37
This is free and unencumbered software released into the public domain.
38

39
Anyone is free to copy, modify, publish, use, compile, sell, or
40
distribute this software, either in source code form or as a compiled
41
binary, for any purpose, commercial or non-commercial, and by any
42
means.
43

44
In jurisdictions that recognize copyright laws, the author or authors
45
of this software dedicate any and all copyright interest in the
46
software to the public domain. We make this dedication for the benefit
47
of the public at large and to the detriment of our heirs and
48
successors. We intend this dedication to be an overt act of
49
relinquishment in perpetuity of all present and future rights to this
50
software under copyright law.
51

52
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
55
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
56
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
57
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
58
OTHER DEALINGS IN THE SOFTWARE.
59
*/
60

61
#ifndef H_R128_H
62
#define H_R128_H
63

64
#include <stddef.h>
65

66
// 64-bit integer support
67
// If your compiler does not have stdint.h, add appropriate defines for these macros.
68
#if defined(_MSC_VER) && (_MSC_VER < 1600)
69
#  define R128_S32 __int32
70
#  define R128_U32 unsigned __int32
71
#  define R128_S64 __int64
72
#  define R128_U64 unsigned __int64
73
#  define R128_LIT_S64(x) x##i64
74
#  define R128_LIT_U64(x) x##ui64
75
#else
76
#  include <stdint.h>
77
#  define R128_S32 int32_t
78
#  define R128_U32 uint32_t
79
#  define R128_S64 long long
80
#  define R128_U64 unsigned long long
81
#  define R128_LIT_S64(x) x##ll
82
#  define R128_LIT_U64(x) x##ull
83
#endif
84

85
#ifdef __cplusplus
86
extern "C" {
87
#endif
88

89
typedef struct R128 {
90
   R128_U64 lo;
91
   R128_U64 hi;
92

93
#ifdef __cplusplus
94
   R128();
95
   R128(double);
96
   R128(int);
97
   R128(R128_S64);
98
   R128(R128_U64 low, R128_U64 high);
99

100
   operator double() const;
101
   operator R128_S64() const;
102
   operator int() const;
103
   operator bool() const;
104

105
   bool operator!() const;
106
   R128 operator~() const;
107
   R128 operator-() const;
108
   R128 &operator|=(const R128 &rhs);
109
   R128 &operator&=(const R128 &rhs);
110
   R128 &operator^=(const R128 &rhs);
111
   R128 &operator+=(const R128 &rhs);
112
   R128 &operator-=(const R128 &rhs);
113
   R128 &operator*=(const R128 &rhs);
114
   R128 &operator/=(const R128 &rhs);
115
   R128 &operator%=(const R128 &rhs);
116
   R128 &operator<<=(int amount);
117
   R128 &operator>>=(int amount);
118
#endif   //__cplusplus
119
} R128;
120

121
// Type conversion
122
extern void r128FromInt(R128 *dst, R128_S64 v);
123
extern void r128FromFloat(R128 *dst, double v);
124
extern R128_S64 r128ToInt(const R128 *v);
125
extern double r128ToFloat(const R128 *v);
126

127
// Copy
128
extern void r128Copy(R128 *dst, const R128 *src);
129

130
// Sign manipulation
131
extern void r128Neg(R128 *dst, const R128 *v);   // -v
132
extern void r128Abs(R128* dst, const R128* v);   // abs(v)
133
extern void r128Nabs(R128* dst, const R128* v);  // -abs(v)
134

135
// Bitwise operations
136
extern void r128Not(R128 *dst, const R128 *src);               // ~a
137
extern void r128Or(R128 *dst, const R128 *a, const R128 *b);   // a | b
138
extern void r128And(R128 *dst, const R128 *a, const R128 *b);  // a & b
139
extern void r128Xor(R128 *dst, const R128 *a, const R128 *b);  // a ^ b
140
extern void r128Shl(R128 *dst, const R128 *src, int amount);   // shift left by amount mod 128
141
extern void r128Shr(R128 *dst, const R128 *src, int amount);   // shift right logical by amount mod 128
142
extern void r128Sar(R128 *dst, const R128 *src, int amount);   // shift right arithmetic by amount mod 128
143

144
// Arithmetic
145
extern void r128Add(R128 *dst, const R128 *a, const R128 *b);  // a + b
146
extern void r128Sub(R128 *dst, const R128 *a, const R128 *b);  // a - b
147
extern void r128Mul(R128 *dst, const R128 *a, const R128 *b);  // a * b
148
extern void r128Div(R128 *dst, const R128 *a, const R128 *b);  // a / b
149
extern void r128Mod(R128 *dst, const R128 *a, const R128 *b);  // a - toInt(a / b) * b
150

151
extern void r128Sqrt(R128 *dst, const R128 *v);  // sqrt(v)
152
extern void r128Rsqrt(R128 *dst, const R128 *v); // 1 / sqrt(v)
153

154
// Comparison
155
extern int  r128Cmp(const R128 *a, const R128 *b);  // sign of a-b
156
extern void r128Min(R128 *dst, const R128 *a, const R128 *b);
157
extern void r128Max(R128 *dst, const R128 *a, const R128 *b);
158
extern void r128Floor(R128 *dst, const R128 *v);
159
extern void r128Ceil(R128 *dst, const R128 *v);
160
extern void r128Round(R128 *dst, const R128 *v);    // round to nearest, rounding halfway values away from zero
161
extern int  r128IsNeg(const R128 *v); // quick check for < 0
162

163
// String conversion
164
//
165
// Selects the character written in front of positive values by the
// string conversion functions.
// NOTE: no comma after the last enumerator; a trailing comma is rejected by
// strict C89 and C++03 compilers.
typedef enum R128ToStringSign {
   R128ToStringSign_Default,  // no sign character for positive values
   R128ToStringSign_Space,    // leading space for positive values
   R128ToStringSign_Plus      // leading '+' for positive values
} R128ToStringSign;
170

171
// Formatting options for use with r128ToStringOpt. The "defaults" correspond
172
// to a format string of "%f".
173
//
174
typedef struct R128ToStringFormat {
175
   // sign character for positive values. Default is R128ToStringSign_Default.
176
   R128ToStringSign sign;
177

178
   // minimum number of characters to write. Default is 0.
179
   int width;
180

181
   // place to the right of the decimal at which rounding is performed. If negative,
182
   // a maximum of 20 decimal places will be written, with no trailing zeroes.
183
   // (20 places is sufficient to ensure that r128FromString will convert back to the
184
   // original value.) Default is -1. NOTE: This is not the same default that the C
185
   // standard library uses for %f.
186
   int precision;
187

188
   // If non-zero, pads the output string with leading zeroes if the final result is
189
   // fewer than width characters. Otherwise, leading spaces are used. Default is 0.
190
   int zeroPad;
191

192
   // Always print a decimal point, even if the value is an integer. Default is 0.
193
   int decimal;
194

195
   // Left-align output if width specifier requires padding.
196
   // Default is 0 (right align).
197
   int leftAlign;
198
} R128ToStringFormat;
199

200
// r128ToStringOpt: convert R128 to a decimal string, with formatting.
201
//
202
// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
203
// (including null terminator). No additional rounding is performed if dstSize is not large
204
// enough to hold the entire string.
205
//
206
// opt: an R128ToStringFormat struct (q.v.) with formatting options.
207
//
208
// Uses the R128_decimal global as the decimal point character.
209
// Always writes a null terminator, even if the destination buffer is not large enough.
210
//
211
// Number of bytes that will be written (i.e. how big does dst need to be?):
212
// If width is specified: width + 1 bytes.
213
// If precision is specified: at most precision + 22 bytes.
214
// If neither is specified: at most 42 bytes.
215
//
216
// Returns the number of bytes that would have been written if dst was sufficiently large,
217
// not including the final null terminator.
218
//
219
extern int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt);
220

221
// r128ToStringf: convert R128 to a decimal string, with formatting.
222
//
223
// dst and dstSize: specify the buffer to write into. At most dstSize bytes will be written
224
// (including null terminator).
225
//
226
// format: a printf-style format specifier, as one would use with floating point types.
227
//    e.g. "%+5.2f". (The leading % and trailing f are optional.)
228
//    NOTE: This is NOT a full replacement for sprintf. Any characters in the format string
229
//       that do not correspond to a format placeholder are ignored.
230
//
231
// Uses the R128_decimal global as the decimal point character.
232
// Always writes a null terminator, even if the destination buffer is not large enough.
233
//
234
// Number of bytes that will be written (i.e. how big does dst need to be?):
235
// If the precision field is specified: at most max(width, precision + 21) + 1 bytes
236
// Otherwise: at most max(width, 41) + 1 bytes.
237
//
238
// Returns the number of bytes that would have been written if dst was sufficiently large,
239
// not including the final null terminator.
240
//
241
extern int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v);
242

243
// r128ToString: convert R128 to a decimal string, with default formatting.
244
// Equivalent to r128ToStringf(dst, dstSize, "%f", v).
245
//
246
// Uses the R128_decimal global as the decimal point character.
247
// Always writes a null terminator, even if the destination buffer is not large enough.
248
//
249
// Will write at most 42 bytes (including NUL) to dst.
250
//
251
// Returns the number of bytes that would have been written if dst was sufficiently large,
252
// not including the final null terminator.
253
//
254
extern int r128ToString(char *dst, size_t dstSize, const R128 *v);
255

256
// r128FromString: Convert string to R128.
257
//
258
// The string can be formatted either as a decimal number with optional sign
259
// or as hexadecimal with a prefix of 0x or 0X.
260
//
261
// endptr, if not NULL, is set to the character following the last character
262
//   used in the conversion.
263
//
264
extern void r128FromString(R128 *dst, const char *s, char **endptr);
265

266
// Constants
267
extern const R128 R128_min;      // minimum (most negative) value
268
extern const R128 R128_max;      // maximum (most positive) value
269
extern const R128 R128_smallest; // smallest positive value
270
extern const R128 R128_zero;     // zero
271
extern const R128 R128_one;      // 1.0
272

273
extern char R128_decimal;        // decimal point character used by r128From/ToString. defaults to '.'
274

275
#ifdef __cplusplus
276
}
277

278
#include <limits>
279
namespace std {
280
template<>
281
struct numeric_limits<R128>
282
{
283
   static const bool is_specialized = true;
284

285
   static R128 min() throw() { return R128_min; }
286
   static R128 max() throw() { return R128_max; }
287

288
   static const int digits = 127;
289
   static const int digits10 = 38;
290
   static const bool is_signed = true;
291
   static const bool is_integer = false;
292
   static const bool is_exact = false;
293
   static const int radix = 2;
294
   static R128 epsilon() throw() { return R128_smallest; }
295
   static R128 round_error() throw() { return R128_one; }
296

297
   static const int min_exponent = 0;
298
   static const int min_exponent10 = 0;
299
   static const int max_exponent = 0;
300
   static const int max_exponent10 = 0;
301

302
   static const bool has_infinity = false;
303
   static const bool has_quiet_NaN = false;
304
   static const bool has_signaling_NaN = false;
305
   static const float_denorm_style has_denorm = denorm_absent;
306
   static const bool has_denorm_loss = false;
307

308
   static R128 infinity() throw() { return R128_zero; }
309
   static R128 quiet_NaN() throw() { return R128_zero; }
310
   static R128 signaling_NaN() throw() { return R128_zero; }
311
   static R128 denorm_min() throw() { return R128_zero; }
312

313
   static const bool is_iec559 = false;
314
   static const bool is_bounded = true;
315
   static const bool is_modulo = true;
316

317
   static const bool traps = numeric_limits<R128_U64>::traps;
318
   static const bool tinyness_before = false;
319
   static const float_round_style round_style = round_toward_zero;
320
};
321
}  //namespace std
322

323
// Default constructor: leaves lo and hi uninitialized.
inline R128::R128() {}

// Construct from a double via r128FromFloat.
inline R128::R128(double v)
{
   r128FromFloat(this, v);
}

// Construct from an int via r128FromInt.
inline R128::R128(int v)
{
   r128FromInt(this, v);
}

// Construct from a 64-bit signed integer via r128FromInt.
inline R128::R128(R128_S64 v)
{
   r128FromInt(this, v);
}

// Construct directly from the raw low and high 64-bit halves.
inline R128::R128(R128_U64 low, R128_U64 high)
   : lo(low), hi(high)
{
}

// Convert to double via r128ToFloat.
inline R128::operator double() const
{
   const double asFloat = r128ToFloat(this);
   return asFloat;
}

// Convert to a 64-bit signed integer via r128ToInt.
inline R128::operator R128_S64() const
{
   const R128_S64 asInt = r128ToInt(this);
   return asInt;
}

// Convert to int: r128ToInt followed by a narrowing cast.
inline R128::operator int() const
{
   return static_cast<int>(r128ToInt(this));
}

// True when any bit of the value is set.
inline R128::operator bool() const
{
   return (lo | hi) != 0;
}
365

366
inline bool R128::operator!() const
367
{
368
   return !lo && !hi;
369
}
370

371
inline R128 R128::operator~() const
372
{
373
   R128 r;
374
   r128Not(&r, this);
375
   return r;
376
}
377

378
inline R128 R128::operator-() const
379
{
380
   R128 r;
381
   r128Neg(&r, this);
382
   return r;
383
}
384

385
inline R128 &R128::operator|=(const R128 &rhs)
386
{
387
   r128Or(this, this, &rhs);
388
   return *this;
389
}
390

391
inline R128 &R128::operator&=(const R128 &rhs)
392
{
393
   r128And(this, this, &rhs);
394
   return *this;
395
}
396

397
inline R128 &R128::operator^=(const R128 &rhs)
398
{
399
   r128Xor(this, this, &rhs);
400
   return *this;
401
}
402

403
inline R128 &R128::operator+=(const R128 &rhs)
404
{
405
   r128Add(this, this, &rhs);
406
   return *this;
407
}
408

409
inline R128 &R128::operator-=(const R128 &rhs)
410
{
411
   r128Sub(this, this, &rhs);
412
   return *this;
413
}
414

415
inline R128 &R128::operator*=(const R128 &rhs)
416
{
417
   r128Mul(this, this, &rhs);
418
   return *this;
419
}
420

421
inline R128 &R128::operator/=(const R128 &rhs)
422
{
423
   r128Div(this, this, &rhs);
424
   return *this;
425
}
426

427
inline R128 &R128::operator%=(const R128 &rhs)
428
{
429
   r128Mod(this, this, &rhs);
430
   return *this;
431
}
432

433
inline R128 &R128::operator<<=(int amount)
434
{
435
   r128Shl(this, this, amount);
436
   return *this;
437
}
438

439
inline R128 &R128::operator>>=(int amount)
440
{
441
   r128Sar(this, this, amount);
442
   return *this;
443
}
444

445
// Binary operators: each copies the left operand and applies the matching
// compound-assignment operator to the copy.

static inline R128 operator|(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result |= rhs;
   return result;
}

static inline R128 operator&(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result &= rhs;
   return result;
}

static inline R128 operator^(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result ^= rhs;
   return result;
}

static inline R128 operator+(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result += rhs;
   return result;
}

static inline R128 operator-(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result -= rhs;
   return result;
}

static inline R128 operator*(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result *= rhs;
   return result;
}

static inline R128 operator/(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result /= rhs;
   return result;
}

static inline R128 operator%(const R128 &lhs, const R128 &rhs)
{
   R128 result = lhs;
   result %= rhs;
   return result;
}

static inline R128 operator<<(const R128 &lhs, int amount)
{
   R128 result = lhs;
   result <<= amount;
   return result;
}

static inline R128 operator>>(const R128 &lhs, int amount)
{
   R128 result = lhs;
   result >>= amount;
   return result;
}
504

505
static inline bool operator<(const R128 &lhs, const R128 &rhs)
506
{
507
   return r128Cmp(&lhs, &rhs) < 0;
508
}
509

510
static inline bool operator>(const R128 &lhs, const R128 &rhs)
511
{
512
   return r128Cmp(&lhs, &rhs) > 0;
513
}
514

515
static inline bool operator<=(const R128 &lhs, const R128 &rhs)
516
{
517
   return r128Cmp(&lhs, &rhs) <= 0;
518
}
519

520
static inline bool operator>=(const R128 &lhs, const R128 &rhs)
521
{
522
   return r128Cmp(&lhs, &rhs) >= 0;
523
}
524

525
static inline bool operator==(const R128 &lhs, const R128 &rhs)
526
{
527
   return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
528
}
529

530
static inline bool operator!=(const R128 &lhs, const R128 &rhs)
531
{
532
   return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
533
}
534

535
#endif   //__cplusplus
536
#endif   //H_R128_H
537

538
#ifdef R128_IMPLEMENTATION
539

540
// R128_DEBUG_SET(x): with R128_DEBUG_VIS defined, formats x into the
// R128_last buffer; otherwise expands to nothing.
#ifdef R128_DEBUG_VIS
#  define R128_DEBUG_SET(x)   r128ToString(R128_last, sizeof(R128_last), x)
#else
#  define R128_DEBUG_SET(x)
#endif

// R128_SET2(x, l, h): store the 64-bit halves l (low) and h (high) into *x.
#define R128_SET2(x, l, h) \
   do { \
      (x)->lo = (R128_U64)(l); \
      (x)->hi = (R128_U64)(h); \
   } while(0)

// R128_R0 .. R128_R3: read the four 32-bit words of *x, R0 being the low
// word of lo and R3 the high word of hi.
// R128_SET4(x, r0, r1, r2, r3): assemble *x from four 32-bit words.
#define R128_R0(x) ((R128_U32)(x)->lo)
#define R128_R2(x) ((R128_U32)(x)->hi)
#if defined(_M_IX86)
// workaround: MSVC x86's handling of 64-bit values is not great
#  define R128_SET4(x, r0, r1, r2, r3) \
      do { \
         ((R128_U32*)&(x)->lo)[0] = (R128_U32)(r0); \
         ((R128_U32*)&(x)->lo)[1] = (R128_U32)(r1); \
         ((R128_U32*)&(x)->hi)[0] = (R128_U32)(r2); \
         ((R128_U32*)&(x)->hi)[1] = (R128_U32)(r3); \
      } while(0)
#  define R128_R1(x) (((R128_U32*)&(x)->lo)[1])
#  define R128_R3(x) (((R128_U32*)&(x)->hi)[1])
#else
#  define R128_SET4(x, r0, r1, r2, r3) \
      do { \
         (x)->lo = (R128_U64)(r0) | ((R128_U64)(r1) << 32); \
         (x)->hi = (R128_U64)(r2) | ((R128_U64)(r3) << 32); \
      } while(0)
#  define R128_R1(x) ((R128_U32)((x)->lo >> 32))
#  define R128_R3(x) ((R128_U32)((x)->hi >> 32))
#endif
565

566
#if defined(_M_X64)
567
#  define R128_INTEL 1
568
#  define R128_64BIT 1
569
#  ifndef R128_STDC_ONLY
570
#     include <intrin.h>
571
#  endif
572
#elif defined(__x86_64__)
573
#  define R128_INTEL 1
574
#  define R128_64BIT 1
575
#  ifndef R128_STDC_ONLY
576
#     include <x86intrin.h>
577
#  endif
578
#elif defined(_M_IX86)
579
#  define R128_INTEL 1
580
#  ifndef R128_STDC_ONLY
581
#     include <intrin.h>
582
#  endif
583
#elif defined(__i386__)
584
#  define R128_INTEL 1
585
#  ifndef R128_STDC_ONLY
586
#     include <x86intrin.h>
587
#  endif
588
#elif defined(_M_ARM)
589
#  ifndef R128_STDC_ONLY
590
#     include <intrin.h>
591
#  endif
592
#elif defined(_M_ARM64)
593
#  define R128_64BIT 1
594
#  ifndef R128_STDC_ONLY
595
#     include <intrin.h>
596
#  endif
597
#elif defined(__aarch64__)
598
#  define R128_64BIT 1
599
#endif
600

601
#ifndef R128_INTEL
602
#  define R128_INTEL 0
603
#endif
604

605
#ifndef R128_64BIT
606
#  define R128_64BIT 0
607
#endif
608

609
#ifndef R128_ASSERT
610
#  include <assert.h>
611
#  define R128_ASSERT(x) assert(x)
612
#endif
613

614
#include <stdlib.h>  // for NULL
615

616
static const R128ToStringFormat R128__defaultFormat = {
617
   R128ToStringSign_Default,
618
   0,
619
   -1,
620
   0,
621
   0,
622
   0
623
};
624

625
const R128 R128_min = { 0, R128_LIT_U64(0x8000000000000000) };
626
const R128 R128_max = { R128_LIT_U64(0xffffffffffffffff), R128_LIT_U64(0x7fffffffffffffff) };
627
const R128 R128_smallest = { 1, 0 };
628
const R128 R128_zero = { 0, 0 };
629
const R128 R128_one = { 0, 1 };
630
char R128_decimal = '.';
631
#ifdef R128_DEBUG_VIS
632
char R128_last[42];
633
#endif
634

635
// Returns the number of leading zero bits in x, or 64 when x is zero.
static int r128__clz64(R128_U64 x)
{
#if defined(R128_STDC_ONLY)
   // Portable binary search: halve the window while the upper part is
   // non-zero. On exit x is 0 or 1, which accounts for the final bit.
   R128_U64 zeros = 64;
   int step;
   for (step = 32; step > 0; step >>= 1) {
      R128_U64 upper = x >> step;
      if (upper) {
         zeros -= step;
         x = upper;
      }
   }
   return (int)(zeros - x);
#elif defined(_M_X64) || defined(_M_ARM64)
   unsigned long msb;
   return _BitScanReverse64(&msb, x) ? 63 - (int)msb : 64;
#elif defined(_MSC_VER)
   // 32-bit MSVC: scan the high word first, then the low word.
   unsigned long msb;
   if (_BitScanReverse(&msb, (R128_U32)(x >> 32))) {
      return 31 - (int)msb;
   }
   if (_BitScanReverse(&msb, (R128_U32)x)) {
      return 63 - (int)msb;
   }
   return 64;
#else
   // zero is handled here rather than passed to the builtin
   if (!x) {
      return 64;
   }
   return __builtin_clzll(x);
#endif
}
666

667
#if !R128_64BIT
668
// 32*32->64
669
static R128_U64 r128__umul64(R128_U32 a, R128_U32 b)
670
{
671
#  if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
672
   return __emulu(a, b);
673
#  elif defined(_M_ARM) && !defined(R128_STDC_ONLY)
674
   return _arm_umull(a, b);
675
#  else
676
   return a * (R128_U64)b;
677
#  endif
678
}
679

680
// 64/32->32
681
static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
682
{
683
#  if defined(_M_IX86) && (_MSC_VER >= 1920) && !defined(R128_STDC_ONLY)
684
   unsigned __int64 n = ((unsigned __int64)nhi << 32) | nlo;
685
   return _udiv64(n, d, rem);
686
#  elif defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
687
   __asm {
688
      mov eax, nlo
689
      mov edx, nhi
690
      div d
691
      mov ecx, rem
692
      mov dword ptr [ecx], edx
693
   }
694
#  elif defined(__i386__) && !defined(R128_STDC_ONLY)
695
   R128_U32 q, r;
696
   __asm("divl %4"
697
      : "=a"(q), "=d"(r)
698
      : "a"(nlo), "d"(nhi), "X"(d));
699
   *rem = r;
700
   return q;
701
#  else
702
   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
703
   *rem = (R128_U32)(n64 % d);
704
   return (R128_U32)(n64 / d);
705
#  endif
706
}
707
#elif defined(R128_STDC_ONLY) || !R128_INTEL
708
#define r128__umul64(a, b) ((a) * (R128_U64)(b))
709
static R128_U32 r128__udiv64(R128_U32 nlo, R128_U32 nhi, R128_U32 d, R128_U32 *rem)
710
{
711
   R128_U64 n64 = ((R128_U64)nhi << 32) | nlo;
712
   *rem = (R128_U32)(n64 % d);
713
   return (R128_U32)(n64 / d);
714
}
715
#endif   //!R128_64BIT
716

717
// Two's-complement negation: writes (~src + 1) to *dst.
static void r128__neg(R128 *dst, const R128 *src)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

#if R128_INTEL && !defined(R128_STDC_ONLY)
   {
      // add 1 to the complemented value, propagating the carry upward
      unsigned char carry = 0;
#  if R128_64BIT
      carry = _addcarry_u64(carry, ~src->lo, 1, &dst->lo);
      carry = _addcarry_u64(carry, ~src->hi, 0, &dst->hi);
#  else
      R128_U32 w0, w1, w2, w3;
      carry = _addcarry_u32(carry, ~R128_R0(src), 1, &w0);
      carry = _addcarry_u32(carry, ~R128_R1(src), 0, &w1);
      carry = _addcarry_u32(carry, ~R128_R2(src), 0, &w2);
      carry = _addcarry_u32(carry, ~R128_R3(src), 0, &w3);
      R128_SET4(dst, w0, w1, w2, w3);
#  endif //R128_64BIT
   }
#else
   {
      const R128_U64 lo = src->lo;
      const R128_U64 hi = src->hi;

      // ~lo + 1 wraps to zero exactly when lo is zero, which is also the
      // only case in which a carry reaches the high half.
      dst->lo = ~lo + 1;
      dst->hi = ~hi + (lo == 0);
   }
#endif   //R128_INTEL
}
747

748
// 64*64->128
749
static void r128__umul128(R128 *dst, R128_U64 a, R128_U64 b)
750
{
751
#if defined(_M_X64) && !defined(R128_STDC_ONLY)
752
   dst->lo = _umul128(a, b, &dst->hi);
753
#elif R128_64BIT && !defined(_MSC_VER) && !defined(R128_STDC_ONLY)
754
   unsigned __int128 p0 = a * (unsigned __int128)b;
755
   dst->hi = (R128_U64)(p0 >> 64);
756
   dst->lo = (R128_U64)p0;
757
#else
758
   R128_U32 alo = (R128_U32)a;
759
   R128_U32 ahi = (R128_U32)(a >> 32);
760
   R128_U32 blo = (R128_U32)b;
761
   R128_U32 bhi = (R128_U32)(b >> 32);
762
   R128_U64 p0, p1, p2, p3;
763

764
   p0 = r128__umul64(alo, blo);
765
   p1 = r128__umul64(alo, bhi);
766
   p2 = r128__umul64(ahi, blo);
767
   p3 = r128__umul64(ahi, bhi);
768

769
   {
770
#if R128_INTEL && !defined(R128_STDC_ONLY)
771
      R128_U32 r0, r1, r2, r3;
772
      unsigned char carry;
773

774
      r0 = (R128_U32)(p0);
775
      r1 = (R128_U32)(p0 >> 32);
776
      r2 = (R128_U32)(p1 >> 32);
777
      r3 = (R128_U32)(p3 >> 32);
778

779
      carry = _addcarry_u32(0, r1, (R128_U32)p1, &r1);
780
      carry = _addcarry_u32(carry, r2, (R128_U32)(p2 >> 32), &r2);
781
      _addcarry_u32(carry, r3, 0, &r3);
782
      carry = _addcarry_u32(0, r1, (R128_U32)p2, &r1);
783
      carry = _addcarry_u32(carry, r2, (R128_U32)p3, &r2);
784
      _addcarry_u32(carry, r3, 0, &r3);
785

786
      R128_SET4(dst, r0, r1, r2, r3);
787
#else
788
      R128_U64 carry, lo, hi;
789
      carry = ((R128_U64)(R128_U32)p1 + (R128_U64)(R128_U32)p2 + (p0 >> 32)) >> 32;
790

791
      lo = p0 + ((p1 + p2) << 32);
792
      hi = p3 + ((R128_U32)(p1 >> 32) + (R128_U32)(p2 >> 32)) + carry;
793

794
      R128_SET2(dst, lo, hi);
795
#endif
796
   }
797
#endif
798
}
799

800
// 128/64->64
801
#if defined(_M_X64) && (_MSC_VER < 1920) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
802
// MSVC x64 provides neither inline assembly nor (pre-2019) a div intrinsic, so we do fake
803
// "inline assembly" to avoid long division or outline assembly.
804
#pragma code_seg(".text")
805
__declspec(allocate(".text") align(16)) static const unsigned char r128__udiv128Code[] = {
806
   0x48, 0x8B, 0xC1,       //mov  rax, rcx
807
   0x49, 0xF7, 0xF0,       //div  rax, r8
808
   0x49, 0x89, 0x11,       //mov  qword ptr [r9], rdx
809
   0xC3                    //ret
810
};
811
typedef R128_U64 (*r128__udiv128Proc)(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem);
812
static const r128__udiv128Proc r128__udiv128 = (r128__udiv128Proc)(void*)r128__udiv128Code;
813
#else
814
// 128/64->64: divides the 128-bit value nhi:nlo by d.
// Returns the 64-bit quotient and stores the remainder in *rem.
// Requires d != 0 and nhi < d (so the quotient fits in 64 bits); the
// portable path asserts both.
static R128_U64 r128__udiv128(R128_U64 nlo, R128_U64 nhi, R128_U64 d, R128_U64 *rem)
{
#if defined(_M_X64) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
   return _udiv128(nhi, nlo, d, rem);
#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
   R128_U64 q, r;
   // The divisor uses "rm" (register or memory): div cannot take an
   // immediate operand, which the catch-all "X" constraint would permit.
   __asm("divq %4"
      : "=a"(q), "=d"(r)
      : "a"(nlo), "d"(nhi), "rm"(d));
   *rem = r;
   return q;
#else
   // Portable schoolbook long division producing two 32-bit quotient digits.
   R128_U64 tmp;
   R128_U32 d0, d1;
   R128_U32 n3, n2, n1, n0;
   R128_U32 q0, q1;
   R128_U32 r;
   int shift;

   R128_ASSERT(d != 0);    //division by zero
   R128_ASSERT(nhi < d);   //overflow

   // normalize: shift divisor and numerator left until the divisor's top
   // bit is set
   shift = r128__clz64(d);

   if (shift) {
      R128 tmp128;
      R128_SET2(&tmp128, nlo, nhi);
      r128Shl(&tmp128, &tmp128, shift);
      n3 = R128_R3(&tmp128);
      n2 = R128_R2(&tmp128);
      n1 = R128_R1(&tmp128);
      n0 = R128_R0(&tmp128);
      d <<= shift;
   } else {
      n3 = (R128_U32)(nhi >> 32);
      n2 = (R128_U32)nhi;
      n1 = (R128_U32)(nlo >> 32);
      n0 = (R128_U32)nlo;
   }

   d1 = (R128_U32)(d >> 32);
   d0 = (R128_U32)d;

   // first digit
   R128_ASSERT(n3 <= d1);
   if (n3 < d1) {
      q1 = r128__udiv64(n2, n3, d1, &r);
   } else {
      // n3 == d1: start from the largest digit; the matching remainder
      // estimate is n2 + d1.
      q1 = 0xffffffffu;
      r = n2 + d1;
      if (r < d1) {
         // The remainder estimate does not fit in 32 bits. In that case
         // q1 * d0 < 2^64 <= estimate * 2^32, so q1 is already exact and
         // refining with the wrapped value would wrongly decrement it.
         goto digit1;
      }
   }
refine1:
   if (r128__umul64(q1, d0) > ((R128_U64)r << 32) + n1) {
      --q1;
      // keep refining only while r + d1 still fits in 32 bits
      if (r < ~d1 + 1) {
         r += d1;
         goto refine1;
      }
   }

digit1:
   // subtract q1 * d from the top three numerator digits (mod 2^64)
   tmp = ((R128_U64)n2 << 32) + n1 - (r128__umul64(q1, d0) + (r128__umul64(q1, d1) << 32));
   n2 = (R128_U32)(tmp >> 32);
   n1 = (R128_U32)tmp;

   // second digit
   R128_ASSERT(n2 <= d1);
   if (n2 < d1) {
      q0 = r128__udiv64(n1, n2, d1, &r);
   } else {
      q0 = 0xffffffffu;
      r = n1 + d1;
      if (r < d1) {
         // same wrap-around case as for the first digit: q0 is exact
         goto digit0;
      }
   }
refine0:
   if (r128__umul64(q0, d0) > ((R128_U64)r << 32) + n0) {
      --q0;
      if (r < ~d1 + 1) {
         r += d1;
         goto refine0;
      }
   }

digit0:
   tmp = ((R128_U64)n1 << 32) + n0 - (r128__umul64(q0, d0) + (r128__umul64(q0, d1) << 32));
   n1 = (R128_U32)(tmp >> 32);
   n0 = (R128_U32)tmp;

   // undo the normalization shift on the remainder
   *rem = (((R128_U64)n1 << 32) + n0) >> shift;
   return ((R128_U64)q1 << 32) + q0;
#endif
}
904
#endif
905

906
static int r128__ucmp(const R128 *a, const R128 *b)
907
{
908
   if (a->hi != b->hi) {
909
      if (a->hi > b->hi) {
910
         return 1;
911
      } else {
912
         return -1;
913
      }
914
   } else {
915
      if (a->lo == b->lo) {
916
         return 0;
917
      } else if (a->lo > b->lo) {
918
         return 1;
919
      } else {
920
         return -1;
921
      }
922
   }
923
}
924

925
// Unsigned 128x128 multiply keeping bits 64..191 of the 256-bit product,
// plus a rounding increment taken from bit 63. The result is written to
// *dst only after all reads of a and b.
static void r128__umul(R128 *dst, const R128 *a, const R128 *b)
{
#if defined(_M_X64) && !defined(R128_STDC_ONLY)
   R128_U64 prodLo, prodHi;
   R128_U64 lo, hi = 0;
   unsigned char carry;

   // lo*lo: keep the upper half, rounded by bit 63 of the lower half
   prodLo = _umul128(a->lo, b->lo, &prodHi);
   carry = _addcarry_u64(0, prodHi, prodLo >> 63, &lo);
   _addcarry_u64(carry, hi, hi, &hi);

   // lo*hi cross term
   prodLo = _umul128(a->lo, b->hi, &prodHi);
   carry = _addcarry_u64(0, lo, prodLo, &lo);
   _addcarry_u64(carry, hi, prodHi, &hi);

   // hi*lo cross term
   prodLo = _umul128(a->hi, b->lo, &prodHi);
   carry = _addcarry_u64(0, lo, prodLo, &lo);
   _addcarry_u64(carry, hi, prodHi, &hi);

   // hi*hi: only its lower half lands in the result
   prodLo = _umul128(a->hi, b->hi, &prodHi);
   hi += prodLo;

   R128_SET2(dst, lo, hi);
#elif defined(__x86_64__) && !defined(R128_STDC_ONLY)
   unsigned __int128 ll, lh, hl, hh, sum;
   ll = a->lo * (unsigned __int128)b->lo;
   lh = a->lo * (unsigned __int128)b->hi;
   hl = a->hi * (unsigned __int128)b->lo;
   hh = a->hi * (unsigned __int128)b->hi;

   // the last term is the rounding bit (bit 63 of the lo*lo product)
   sum = (hh << 64) + hl + lh + (ll >> 64) + ((R128_U64)ll >> 63);
   dst->lo = (R128_U64)sum;
   dst->hi = (R128_U64)(sum >> 64);
#else
   R128 acc, term, round;

   // lo*lo: shift right by 64 and add the rounding bit
   r128__umul128(&acc, a->lo, b->lo);
   round.hi = 0;
   round.lo = acc.lo >> 63;
   acc.lo = acc.hi;
   acc.hi = 0;
   r128Add(&acc, &acc, &round);

   // cross terms
   r128__umul128(&term, a->hi, b->lo);
   r128Add(&acc, &acc, &term);

   r128__umul128(&term, a->lo, b->hi);
   r128Add(&acc, &acc, &term);

   // hi*hi: shift left by 64
   r128__umul128(&term, a->hi, b->hi);
   term.hi = term.lo;
   term.lo = 0;
   r128Add(&acc, &acc, &term);

   R128_SET2(dst, acc.lo, acc.hi);
#endif
}
979

980
// Shift d left until the high bit is set, and shift n left by the same amount.
981
// returns non-zero on overflow.
982
static int r128__norm(R128 *n, R128 *d, R128_U64 *n2)
983
{
984
   R128_U64 d0, d1;
985
   R128_U64 n0, n1;
986
   int shift;
987

988
   d1 = d->hi;
989
   d0 = d->lo;
990
   n1 = n->hi;
991
   n0 = n->lo;
992

993
   if (d1) {
994
      shift = r128__clz64(d1);
995
      if (shift) {
996
         d1 = (d1 << shift) | (d0 >> (64 - shift));
997
         d0 = d0 << shift;
998
         *n2 = n1 >> (64 - shift);
999
         n1 = (n1 << shift) | (n0 >> (64 - shift));
1000
         n0 = n0 << shift;
1001
      } else {
1002
         *n2 = 0;
1003
      }
1004
   } else {
1005
      shift = r128__clz64(d0);
1006
      if (r128__clz64(n1) <= shift) {
1007
         return 1; // overflow
1008
      }
1009

1010
      if (shift) {
1011
         d1 = d0 << shift;
1012
         d0 = 0;
1013
         *n2 = (n1 << shift) | (n0 >> (64 - shift));
1014
         n1 = n0 << shift;
1015
         n0 = 0;
1016
      } else {
1017
         d1 = d0;
1018
         d0 = 0;
1019
         *n2 = n1;
1020
         n1 = n0;
1021
         n0 = 0;
1022
      }
1023
   }
1024

1025
   R128_SET2(n, n0, n1);
1026
   R128_SET2(d, d0, d1);
1027
   return 0;
1028
}
1029

1030
// Unsigned 64.64 divide: quotient = dividend / divisor.
// The operands are normalized with r128__norm, then the quotient is produced
// as two 64-bit digits. Each digit is estimated by dividing by the top divisor
// word and then decremented while the estimate times the low divisor word
// exceeds the running remainder.
// If r128__norm reports overflow the quotient is set to R128_max.
static void r128__udiv(R128 *quotient, const R128 *dividend, const R128 *divisor)
{
   R128 rem;
   R128 q;
   R128_U64 d0, d1;
   R128_U64 n1, n2, n3;

   R128_ASSERT(dividend != NULL);
   R128_ASSERT(divisor != NULL);
   R128_ASSERT(quotient != NULL);
   R128_ASSERT(divisor->hi != 0 || divisor->lo != 0);  // divide by zero

   // work on normalized copies so the inputs are never modified
   {
      R128 n, d;
      R128_SET2(&n, dividend->lo, dividend->hi);
      R128_SET2(&d, divisor->lo, divisor->hi);
      if (r128__norm(&n, &d, &n3)) {
         R128_SET2(quotient, R128_max.lo, R128_max.hi);
         return;
      }

      d0 = d.lo;
      d1 = d.hi;
      n1 = n.lo;
      n2 = n.hi;
   }

   // high quotient digit
   R128_ASSERT(n3 <= d1);
   {
      R128 est, prod;
      est.lo = n1;
      if (n3 < d1) {
         q.hi = r128__udiv128(n2, n3, d1, &est.hi);
      } else {
         q.hi = R128_LIT_U64(0xffffffffffffffff);
         est.hi = n2 + d1;
      }

      for (;;) {
         r128__umul128(&prod, q.hi, d0);
         if (r128__ucmp(&prod, &est) <= 0) {
            break;
         }
         --q.hi;
         // stop once adding d1 would wrap est.hi
         if (est.hi >= ~d1 + 1) {
            break;
         }
         est.hi += d1;
      }
   }

   // remainder after the high digit: (n2:n1) - q.hi * (d1:d0)
   {
      R128 num, lowProd, highProd;
      num.lo = n1;
      num.hi = n2;

      r128__umul128(&lowProd, q.hi, d0);
      r128__umul128(&highProd, q.hi, d1);

      // shift the d1 product up by one word
      highProd.hi = highProd.lo;
      highProd.lo = 0;
      r128Add(&rem, &lowProd, &highProd);
      r128Sub(&rem, &num, &rem);
   }
   n1 = rem.lo;
   n2 = rem.hi;

   // low quotient digit
   R128_ASSERT(n2 <= d1);
   {
      R128 est, prod;
      est.lo = 0;
      if (n2 < d1) {
         q.lo = r128__udiv128(n1, n2, d1, &est.hi);
      } else {
         q.lo = R128_LIT_U64(0xffffffffffffffff);
         est.hi = n1 + d1;
      }

      for (;;) {
         r128__umul128(&prod, q.lo, d0);
         if (r128__ucmp(&prod, &est) <= 0) {
            break;
         }
         --q.lo;
         if (est.hi >= ~d1 + 1) {
            break;
         }
         est.hi += d1;
      }
   }

   R128_SET2(quotient, q.lo, q.hi);
}
1121

1122
// Computes a single 64-bit quotient digit of n / d, using the same
// estimate-and-refine step as the first digit of r128__udiv.
// n and d are normalized in place by r128__norm.
// Returns 0xffffffffffffffff if r128__norm reports overflow.
static R128_U64 r128__umod(R128 *n, R128 *d)
{
   R128_U64 d0, d1;
   R128_U64 n3, n2, n1;
   R128_U64 q;
   R128 est, prod;

   R128_ASSERT(d != NULL);
   R128_ASSERT(n != NULL);
   R128_ASSERT(d->hi != 0 || d->lo != 0);  // divide by zero

   if (r128__norm(n, d, &n3)) {
      return R128_LIT_U64(0xffffffffffffffff);
   }

   d0 = d->lo;
   d1 = d->hi;
   n1 = n->lo;
   n2 = n->hi;

   R128_ASSERT(n3 < d1);

   // estimate from the top divisor word; est.hi receives the remainder
   est.lo = n1;
   q = r128__udiv128(n2, n3, d1, &est.hi);

   // lower the estimate while q * d0 exceeds the running remainder
   for (;;) {
      r128__umul128(&prod, q, d0);
      if (r128__ucmp(&prod, &est) <= 0) {
         break;
      }
      --q;
      // stop once adding d1 would wrap est.hi
      if (est.hi >= ~d1 + 1) {
         break;
      }
      est.hi += d1;
   }

   return q;
}
1160

1161
static int r128__format(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *format)
1162
{
1163
   char buf[128];
1164
   R128 tmp;
1165
   R128_U64 whole;
1166
   char *cursor, *decimal, *dstp = dst;
1167
   int sign = 0;
1168
   int fullPrecision = 1;
1169
   int width, precision;
1170
   int padCnt, trail = 0;
1171

1172
   R128_ASSERT(dst != NULL && dstSize > 0);
1173
   R128_ASSERT(v != NULL);
1174
   R128_ASSERT(format != NULL);
1175

1176
   --dstSize;
1177

1178
   R128_SET2(&tmp, v->lo, v->hi);
1179
   if (r128IsNeg(&tmp)) {
1180
      r128__neg(&tmp, &tmp);
1181
      sign = 1;
1182
   }
1183

1184
   width = format->width;
1185
   if (width < 0) {
1186
      width = 0;
1187
   }
1188

1189
   precision = format->precision;
1190
   if (precision < 0) {
1191
      // print a maximum of 20 digits
1192
      fullPrecision = 0;
1193
      precision = 20;
1194
   } else if (precision > sizeof(buf) - 21) {
1195
      trail = precision - (sizeof(buf) - 21);
1196
      precision -= trail;
1197
   }
1198

1199
   whole = tmp.hi;
1200
   decimal = cursor = buf;
1201

1202
   // fractional part first in case a carry into the whole part is required
1203
   if (tmp.lo || format->decimal) {
1204
      while (tmp.lo || (fullPrecision && precision)) {
1205
         if ((int)(cursor - buf) == precision) {
1206
            if ((R128_S64)tmp.lo < 0) {
1207
               // round up, propagate carry backwards
1208
               char *c;
1209
               for (c = cursor - 1; c >= buf; --c) {
1210
                  char d = ++*c;
1211
                  if (d <= '9') {
1212
                     goto endfrac;
1213
                  } else {
1214
                     *c = '0';
1215
                  }
1216
               }
1217

1218
               // carry out into the whole part
1219
               whole++;
1220
            }
1221

1222
            break;
1223
         }
1224

1225
         r128__umul128(&tmp, tmp.lo, 10);
1226
         *cursor++ = (char)tmp.hi + '0';
1227
      }
1228

1229
   endfrac:
1230
      if (format->decimal || precision) {
1231
         decimal = cursor;
1232
         *cursor++ = R128_decimal;
1233
      }
1234
   }
1235

1236
   // whole part
1237
   do {
1238
      char digit = (char)(whole % 10);
1239
      whole /= 10;
1240
      *cursor++ = digit + '0';
1241
   } while (whole);
1242

1243
#define R128__WRITE(c) do { if (dstp < dst + dstSize) *dstp = c; ++dstp; } while(0)
1244

1245
   padCnt = width - (int)(cursor - buf) - 1;
1246

1247
   // left padding
1248
   if (!format->leftAlign) {
1249
      char padChar = format->zeroPad ? '0' : ' ';
1250
      if (format->zeroPad) {
1251
         if (sign) {
1252
            R128__WRITE('-');
1253
         } else if (format->sign == R128ToStringSign_Plus) {
1254
            R128__WRITE('+');
1255
         } else if (format->sign == R128ToStringSign_Space) {
1256
            R128__WRITE(' ');
1257
         } else {
1258
            ++padCnt;
1259
         }
1260
      }
1261

1262
      for (; padCnt > 0; --padCnt) {
1263
         R128__WRITE(padChar);
1264
      }
1265
   }
1266

1267
   if (format->leftAlign || !format->zeroPad) {
1268
      if (sign) {
1269
         R128__WRITE('-');
1270
      } else if (format->sign == R128ToStringSign_Plus) {
1271
         R128__WRITE('+');
1272
      } else if (format->sign == R128ToStringSign_Space) {
1273
         R128__WRITE(' ');
1274
      } else {
1275
         ++padCnt;
1276
      }
1277
   }
1278

1279
   {
1280
      char *i;
1281

1282
      // reverse the whole part
1283
      for (i = cursor - 1; i >= decimal; --i) {
1284
         R128__WRITE(*i);
1285
      }
1286

1287
      // copy the fractional part
1288
      for (i = buf; i < decimal; ++i) {
1289
         R128__WRITE(*i);
1290
      }
1291
   }
1292

1293
   // right padding
1294
   if (format->leftAlign) {
1295
      char padChar = format->zeroPad ? '0' : ' ';
1296
      for (; padCnt > 0; --padCnt) {
1297
         R128__WRITE(padChar);
1298
      }
1299
   }
1300

1301
   // trailing zeroes for very large precision
1302
   while (trail--) {
1303
      R128__WRITE('0');
1304
   }
1305

1306
#undef R128__WRITE
1307

1308
   if (dstp <= dst + dstSize) {
1309
      *dstp = '\0';
1310
   } else {
1311
      dst[dstSize] = '\0';
1312
   }
1313
   return (int)(dstp - dst);
1314
}
1315

1316
// Sets dst to the integer v: the whole part receives v's bit pattern and the
// fractional part is cleared.
void r128FromInt(R128 *dst, R128_S64 v)
{
   R128_ASSERT(dst != NULL);

   dst->hi = (R128_U64)v;
   dst->lo = 0;
   R128_DEBUG_SET(dst);
}
1323

1324
// Converts a double to 64.64 fixed point.
// Values at or below -2^63 saturate to R128_min, values at or above 2^63
// saturate to R128_max, and NaN converts to zero. Everything else is split
// into whole and fractional parts; fraction bits beyond 2^-64 are truncated.
void r128FromFloat(R128 *dst, double v)
{
   R128_ASSERT(dst != NULL);

   if (v != v) {
      // NaN: casting it to an integer is undefined, so pick a defined result
      R128 zero;
      zero.lo = 0;
      zero.hi = 0;
      r128Copy(dst, &zero);
   } else if (v <= -9223372036854775808.0) {
      // -2^63 itself is handled here: it is exactly R128_min, and sending it
      // down the general path would negate it to 2^63, which does not fit in
      // R128_S64 (out-of-range float-to-integer conversion is undefined).
      r128Copy(dst, &R128_min);
   } else if (v >= 9223372036854775808.0) {
      r128Copy(dst, &R128_max);
   } else {
      R128 r;
      int sign = 0;

      // convert the magnitude, then restore the sign
      if (v < 0) {
         v = -v;
         sign = 1;
      }

      r.hi = (R128_U64)(R128_S64)v;
      v -= (R128_S64)v;
      // v is now in [0, 1); scale it by 2^64 to get the fractional word
      r.lo = (R128_U64)(v * 18446744073709551616.0);

      if (sign) {
         r128__neg(&r, &r);
      }

      r128Copy(dst, &r);
   }
}
1352

1353
// Parses a number from s into dst.
// Accepted syntax: optional whitespace, optional '+' or '-', optional "0x"/"0X"
// prefix selecting base 16, whole-part digits, and an optional fractional part
// introduced by R128_decimal. The whole part is accumulated in 64-bit unsigned
// arithmetic. If endptr is not NULL it receives the address of the first
// character that was not consumed.
void r128FromString(R128 *dst, const char *s, char **endptr)
{
   R128_U64 lo = 0, hi = 0;
   R128_U64 base = 10;
   int sign = 0;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(s != NULL);

   R128_SET2(dst, 0, 0);

   // consume whitespace
   while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n' || *s == '\v') {
      ++s;
   }

   // sign
   if (*s == '-') {
      sign = 1;
      ++s;
   } else if (*s == '+') {
      ++s;
   }

   // parse base prefix
   if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
      base = 16;
      s += 2;
   }

   // whole part
   for (;; ++s) {
      R128_U64 digit;
      const char ch = *s;

      if (ch >= '0' && ch <= '9') {
         digit = ch - '0';
      } else if (base == 16 && ch >= 'a' && ch <= 'f') {
         digit = ch - 'a' + 10;
      } else if (base == 16 && ch >= 'A' && ch <= 'F') {
         digit = ch - 'A' + 10;
      } else {
         break;
      }

      hi = hi * base + digit;
   }

   // fractional part
   if (*s == R128_decimal) {
      const char *fracStart = ++s;
      const char *c;

      // advance to one past the last fractional digit
      while (('0' <= *s && *s <= '9') ||
             (base == 16 && (('a' <= *s && *s <= 'f') || ('A' <= *s && *s <= 'F')))) {
         ++s;
      }

      // fold the digits in from last to first: lo = (digit:lo) / base
      for (c = s - 1; c >= fracStart; --c) {
         R128_U64 digit, unused;

         if ('0' <= *c && *c <= '9') {
            digit = *c - '0';
         } else if ('a' <= *c && *c <= 'f') {
            digit = *c - 'a' + 10;
         } else {
            digit = *c - 'A' + 10;
         }

         lo = r128__udiv128(lo, digit, base, &unused);
      }
   }

   R128_SET2(dst, lo, hi);
   if (sign) {
      r128__neg(dst, dst);
   }

   if (endptr) {
      *endptr = (char *) s;
   }
}
1443

1444
R128_S64 r128ToInt(const R128 *v)
1445
{
1446
   R128_ASSERT(v != NULL);
1447
   if ((R128_S64)v->hi < 0) {
1448
      return (R128_S64)v->hi + (v->lo != 0);
1449
   } else {
1450
      return (R128_S64)v->hi;
1451
   }
1452
}
1453

1454
double r128ToFloat(const R128 *v)
1455
{
1456
   R128 tmp;
1457
   int sign = 0;
1458
   double d;
1459

1460
   R128_ASSERT(v != NULL);
1461

1462
   R128_SET2(&tmp, v->lo, v->hi);
1463
   if (r128IsNeg(&tmp)) {
1464
      r128__neg(&tmp, &tmp);
1465
      sign = 1;
1466
   }
1467

1468
   d = tmp.hi + tmp.lo * (1 / 18446744073709551616.0);
1469
   if (sign) {
1470
      d = -d;
1471
   }
1472

1473
   return d;
1474
}
1475

1476
// Formats v into dst using an explicit options structure.
// Forwards directly to r128__format and returns its result.
int r128ToStringOpt(char *dst, size_t dstSize, const R128 *v, const R128ToStringFormat *opt)
{
   const int written = r128__format(dst, dstSize, v, opt);
   return written;
}
1480

1481
int r128ToStringf(char *dst, size_t dstSize, const char *format, const R128 *v)
1482
{
1483
   R128ToStringFormat opts;
1484

1485
   R128_ASSERT(dst != NULL && dstSize);
1486
   R128_ASSERT(format != NULL);
1487
   R128_ASSERT(v != NULL);
1488

1489
   opts.sign = R128__defaultFormat.sign;
1490
   opts.precision = R128__defaultFormat.precision;
1491
   opts.zeroPad = R128__defaultFormat.zeroPad;
1492
   opts.decimal = R128__defaultFormat.decimal;
1493
   opts.leftAlign = R128__defaultFormat.leftAlign;
1494

1495
   if (*format == '%') {
1496
      ++format;
1497
   }
1498

1499
   // flags field
1500
   for (;; ++format) {
1501
      if (*format == ' ' && opts.sign != R128ToStringSign_Plus) {
1502
         opts.sign = R128ToStringSign_Space;
1503
      } else if (*format == '+') {
1504
         opts.sign = R128ToStringSign_Plus;
1505
      } else if (*format == '0') {
1506
         opts.zeroPad = 1;
1507
      } else if (*format == '-') {
1508
         opts.leftAlign = 1;
1509
      } else if (*format == '#') {
1510
         opts.decimal = 1;
1511
      } else {
1512
         break;
1513
      }
1514
   }
1515

1516
   // width field
1517
   opts.width = 0;
1518
   for (;;) {
1519
      if ('0' <= *format && *format <= '9') {
1520
         opts.width = opts.width * 10 + *format++ - '0';
1521
      } else {
1522
         break;
1523
      }
1524
   }
1525

1526
   // precision field
1527
   if (*format == '.') {
1528
      opts.precision = 0;
1529
      ++format;
1530
      for (;;) {
1531
         if ('0' <= *format && *format <= '9') {
1532
            opts.precision = opts.precision * 10 + *format++ - '0';
1533
         } else {
1534
            break;
1535
         }
1536
      }
1537
   }
1538

1539
   return r128__format(dst, dstSize, v, &opts);
1540
}
1541

1542
int r128ToString(char *dst, size_t dstSize, const R128 *v)
1543
{
1544
   return r128__format(dst, dstSize, v, &R128__defaultFormat);
1545
}
1546

1547
// Copies both words of src into dst.
void r128Copy(R128 *dst, const R128 *src)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

   dst->hi = src->hi;
   dst->lo = src->lo;
   R128_DEBUG_SET(dst);
}
1555

1556
// Public negation entry point: delegates to r128__neg, then runs
// R128_DEBUG_SET on the result.
void r128Neg(R128 *dst, const R128 *v)
{
   r128__neg(dst, v);

   R128_DEBUG_SET(dst);
}
1561

1562
// dst = |v|, computed without branching.
// mask is all ones when v is negative and zero otherwise, so (v ^ mask) - mask
// is the two's-complement negation for negative v and v itself otherwise.
void r128Abs(R128* dst, const R128* v)
{
    R128 mask, flipped;

    R128_ASSERT(dst != NULL);
    R128_ASSERT(v != NULL);

    // replicate the sign bit across both words
    mask.hi = (R128_U64)(((R128_S64)v->hi) >> 63);
    mask.lo = mask.hi;

    flipped.hi = v->hi ^ mask.hi;
    flipped.lo = v->lo ^ mask.lo;

    r128Sub(dst, &flipped, &mask);
}
1575

1576
// dst = -|v|, computed without branching.
// Uses the same sign mask as r128Abs with the subtraction operands swapped:
// mask - (v ^ mask).
void r128Nabs(R128* dst, const R128* v)
{
    R128 mask, flipped;

    R128_ASSERT(dst != NULL);
    R128_ASSERT(v != NULL);

    // replicate the sign bit across both words
    mask.hi = (R128_U64)(((R128_S64)v->hi) >> 63);
    mask.lo = mask.hi;

    flipped.hi = v->hi ^ mask.hi;
    flipped.lo = v->lo ^ mask.lo;

    r128Sub(dst, &mask, &flipped);
}
1589

1590
// dst = ~src (bitwise complement of both words).
void r128Not(R128 *dst, const R128 *src)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

   dst->hi = ~src->hi;
   dst->lo = ~src->lo;
   R128_DEBUG_SET(dst);
}
1599

1600
// dst = a | b (bitwise OR of both words).
void r128Or(R128 *dst, const R128 *a, const R128 *b)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   dst->hi = a->hi | b->hi;
   dst->lo = a->lo | b->lo;
   R128_DEBUG_SET(dst);
}
1610

1611
// dst = a & b (bitwise AND of both words).
void r128And(R128 *dst, const R128 *a, const R128 *b)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   dst->hi = a->hi & b->hi;
   dst->lo = a->lo & b->lo;
   R128_DEBUG_SET(dst);
}
1621

1622
// dst = a ^ b (bitwise XOR of both words).
void r128Xor(R128 *dst, const R128 *a, const R128 *b)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   dst->hi = a->hi ^ b->hi;
   dst->lo = a->lo ^ b->lo;
   R128_DEBUG_SET(dst);
}
1632

1633
// dst = src << amount (logical left shift).
// The portable path uses only the low 7 bits of amount. The 32-bit MSVC path
// shifts four 32-bit words with shld and then stores them at a word offset
// derived from the shift count.
void r128Shl(R128 *dst, const R128 *src, int amount)
{
   R128_U64 r[4];

   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
   __asm {
      // load src
      mov edx, dword ptr[src]
      mov ecx, amount

      mov edi, dword ptr[edx]
      mov esi, dword ptr[edx + 4]
      mov ebx, dword ptr[edx + 8]
      mov eax, dword ptr[edx + 12]

      // shift mod 32
      shld eax, ebx, cl
      shld ebx, esi, cl
      shld esi, edi, cl
      shl edi, cl

      // clear out low 12 bytes of stack
      xor edx, edx
      mov dword ptr[r], edx
      mov dword ptr[r + 4], edx
      mov dword ptr[r + 8], edx

      // store shifted amount offset by count/32 bits
      shr ecx, 5
      and ecx, 3
      mov dword ptr[r + ecx * 4 + 0], edi
      mov dword ptr[r + ecx * 4 + 4], esi
      mov dword ptr[r + ecx * 4 + 8], ebx
      mov dword ptr[r + ecx * 4 + 12], eax
   }
#else
   {
      R128_U64 lo = src->lo;
      R128_U64 hi = src->hi;
      const int n = amount & 127;

      if (n >= 64) {
         // the low word moves entirely into the high word
         hi = lo << (n - 64);
         lo = 0;
      } else if (n != 0) {
#  if defined(_M_X64) && !defined(R128_STDC_ONLY)
         hi = __shiftleft128(lo, hi, (char) n);
#  else
         hi = (hi << n) | (lo >> (64 - n));
#  endif
         lo <<= n;
      }

      r[0] = lo;
      r[1] = hi;
   }
#endif   //_M_IX86

   dst->lo = r[0];
   dst->hi = r[1];
   R128_DEBUG_SET(dst);
}
1694

1695
// dst = src >> amount (logical right shift, zero fill).
// The portable path uses only the low 7 bits of amount. The 32-bit MSVC path
// shifts four 32-bit words with shrd and then stores them at a negative word
// offset derived from the shift count; the result is read from r[2] and r[3].
void r128Shr(R128 *dst, const R128 *src, int amount)
{
   R128_U64 r[4];

   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
   __asm {
      // load src
      mov edx, dword ptr[src]
      mov ecx, amount

      mov edi, dword ptr[edx]
      mov esi, dword ptr[edx + 4]
      mov ebx, dword ptr[edx + 8]
      mov eax, dword ptr[edx + 12]

      // shift mod 32
      shrd edi, esi, cl
      shrd esi, ebx, cl
      shrd ebx, eax, cl
      shr eax, cl

      // clear out high 12 bytes of stack
      xor edx, edx
      mov dword ptr[r + 20], edx
      mov dword ptr[r + 24], edx
      mov dword ptr[r + 28], edx

      // store shifted amount offset by -count/32 bits
      shr ecx, 5
      and ecx, 3
      neg ecx
      mov dword ptr[r + ecx * 4 + 16], edi
      mov dword ptr[r + ecx * 4 + 20], esi
      mov dword ptr[r + ecx * 4 + 24], ebx
      mov dword ptr[r + ecx * 4 + 28], eax
   }
#else
   {
      R128_U64 lo = src->lo;
      R128_U64 hi = src->hi;
      const int n = amount & 127;

      if (n >= 64) {
         // the high word moves entirely into the low word
         lo = hi >> (n - 64);
         hi = 0;
      } else if (n != 0) {
#if defined(_M_X64) && !defined(R128_STDC_ONLY)
         lo = __shiftright128(lo, hi, (char) n);
#else
         lo = (lo >> n) | (hi << (64 - n));
#endif
         hi >>= n;
      }

      r[2] = lo;
      r[3] = hi;
   }
#endif

   dst->lo = r[2];
   dst->hi = r[3];
   R128_DEBUG_SET(dst);
}
1756

1757
// dst = src >> amount (arithmetic right shift, sign fill).
// The portable path uses only the low 7 bits of amount. The 32-bit MSVC path
// shifts four 32-bit words (sar on the top word), fills the upper scratch
// words with the sign, and stores at a negative word offset derived from the
// shift count; the result is read from r[2] and r[3].
void r128Sar(R128 *dst, const R128 *src, int amount)
{
   R128_U64 r[4];

   R128_ASSERT(dst != NULL);
   R128_ASSERT(src != NULL);

#if defined(_M_IX86) && !defined(R128_STDC_ONLY) && !defined(__MINGW32__)
   __asm {
      // load src
      mov edx, dword ptr[src]
      mov ecx, amount

      mov edi, dword ptr[edx]
      mov esi, dword ptr[edx + 4]
      mov ebx, dword ptr[edx + 8]
      mov eax, dword ptr[edx + 12]

      // shift mod 32
      shrd edi, esi, cl
      shrd esi, ebx, cl
      shrd ebx, eax, cl
      sar eax, cl

      // copy sign to high 12 bytes of stack
      cdq
      mov dword ptr[r + 20], edx
      mov dword ptr[r + 24], edx
      mov dword ptr[r + 28], edx

      // store shifted amount offset by -count/32 bits
      shr ecx, 5
      and ecx, 3
      neg ecx
      mov dword ptr[r + ecx * 4 + 16], edi
      mov dword ptr[r + ecx * 4 + 20], esi
      mov dword ptr[r + ecx * 4 + 24], ebx
      mov dword ptr[r + ecx * 4 + 28], eax
   }
#else
   r[2] = src->lo;
   r[3] = src->hi;

   amount &= 127;
   if (amount >= 64) {
      // low word receives the sign-extended high word; high word becomes
      // all sign bits
      r[2] = (R128_U64)((R128_S64)r[3] >> (amount - 64));
      r[3] = (R128_U64)((R128_S64)r[3] >> 63);
   } else if (amount) {
      // The bits carried from hi into lo are shifted as an unsigned value:
      // left-shifting a negative signed integer is undefined behavior, and
      // the unsigned shift yields the same bit pattern.
      r[2] = (r[2] >> amount) | (r[3] << (64 - amount));
      r[3] = (R128_U64)((R128_S64)r[3] >> amount);
   }
#endif

   dst->lo = r[2];
   dst->hi = r[3];
   R128_DEBUG_SET(dst);
}
1814

1815
// dst = a + b, wrapping on overflow.
// Intel builds chain the add-with-carry intrinsics over 64-bit or 32-bit
// words; other builds detect the carry out of the low word by comparison.
void r128Add(R128 *dst, const R128 *a, const R128 *b)
{
   unsigned char c = 0;
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

#if R128_INTEL && !defined(R128_STDC_ONLY)
#  if R128_64BIT
   c = _addcarry_u64(c, a->lo, b->lo, &dst->lo);
   c = _addcarry_u64(c, a->hi, b->hi, &dst->hi);
#  else
   R128_U32 w0, w1, w2, w3;
   c = _addcarry_u32(c, R128_R0(a), R128_R0(b), &w0);
   c = _addcarry_u32(c, R128_R1(a), R128_R1(b), &w1);
   c = _addcarry_u32(c, R128_R2(a), R128_R2(b), &w2);
   c = _addcarry_u32(c, R128_R3(a), R128_R3(b), &w3);
   R128_SET4(dst, w0, w1, w2, w3);
#  endif //R128_64BIT
#else
   {
      // compute both words before storing so dst may alias a or b
      const R128_U64 sumLo = a->lo + b->lo;
      R128_U64 sumHi;

      // an unsigned sum smaller than an operand means the add wrapped
      c = sumLo < a->lo;
      sumHi = a->hi + b->hi + c;

      dst->lo = sumLo;
      dst->hi = sumHi;
   }
#endif   //R128_INTEL

   R128_DEBUG_SET(dst);
}
1845

1846
// dst = a - b, wrapping on underflow.
// Intel builds chain the subtract-with-borrow intrinsics over 64-bit or
// 32-bit words; other builds detect the borrow out of the low word by
// comparison.
void r128Sub(R128 *dst, const R128 *a, const R128 *b)
{
   unsigned char bw = 0;
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

#if R128_INTEL && !defined(R128_STDC_ONLY)
#  if R128_64BIT
   bw = _subborrow_u64(bw, a->lo, b->lo, &dst->lo);
   bw = _subborrow_u64(bw, a->hi, b->hi, &dst->hi);
#  else
   R128_U32 w0, w1, w2, w3;
   bw = _subborrow_u32(bw, R128_R0(a), R128_R0(b), &w0);
   bw = _subborrow_u32(bw, R128_R1(a), R128_R1(b), &w1);
   bw = _subborrow_u32(bw, R128_R2(a), R128_R2(b), &w2);
   bw = _subborrow_u32(bw, R128_R3(a), R128_R3(b), &w3);
   R128_SET4(dst, w0, w1, w2, w3);
#  endif //R128_64BIT
#else
   {
      // compute both words before storing so dst may alias a or b
      const R128_U64 diffLo = a->lo - b->lo;
      R128_U64 diffHi;

      // an unsigned difference larger than the minuend means a borrow
      bw = diffLo > a->lo;
      diffHi = a->hi - b->hi - bw;

      dst->lo = diffLo;
      dst->hi = diffHi;
   }
#endif   //R128_INTEL

   R128_DEBUG_SET(dst);
}
1876

1877
// dst = a * b (signed).
// Multiplies the magnitudes with r128__umul and negates the product when
// exactly one operand is negative.
void r128Mul(R128 *dst, const R128 *a, const R128 *b)
{
   R128 lhs, rhs, prod;
   int negate = 0;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   // work on copies so the inputs are untouched and dst may alias them
   R128_SET2(&lhs, a->lo, a->hi);
   R128_SET2(&rhs, b->lo, b->hi);

   if (r128IsNeg(&lhs)) {
      r128__neg(&lhs, &lhs);
      negate = !negate;
   }
   if (r128IsNeg(&rhs)) {
      r128__neg(&rhs, &rhs);
      negate = !negate;
   }

   r128__umul(&prod, &lhs, &rhs);
   if (negate) {
      r128__neg(&prod, &prod);
   }

   r128Copy(dst, &prod);
}
1905

1906
// dst = a / b (signed).
// Divides the magnitudes with r128__udiv and negates the quotient when exactly
// one operand is negative. Division by zero yields R128_min when a is negative
// and R128_max otherwise.
void r128Div(R128 *dst, const R128 *a, const R128 *b)
{
   R128 num, den, quot;
   int negate = 0;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   R128_SET2(&num, a->lo, a->hi);
   R128_SET2(&den, b->lo, b->hi);

   if (r128IsNeg(&num)) {
      r128__neg(&num, &num);
      negate = 1;
   }

   // divide by zero: saturate according to the sign of the dividend
   if (den.lo == 0 && den.hi == 0) {
      r128Copy(dst, negate ? &R128_min : &R128_max);
      return;
   }

   if (r128IsNeg(&den)) {
      r128__neg(&den, &den);
      negate = !negate;
   }

   r128__udiv(&quot, &num, &den);
   if (negate) {
      r128__neg(&quot, &quot);
   }

   r128Copy(dst, &quot);
}
1944

1945
// dst = a - q * b, where q is the signed whole-number quotient built from the
// digit returned by r128__umod. Division by zero yields R128_min when a is
// negative and R128_max otherwise.
void r128Mod(R128 *dst, const R128 *a, const R128 *b)
{
   R128 num, den, quot;
   int negate = 0;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   R128_SET2(&num, a->lo, a->hi);
   R128_SET2(&den, b->lo, b->hi);

   if (r128IsNeg(&num)) {
      r128__neg(&num, &num);
      negate = 1;
   }

   // divide by zero: saturate according to the sign of the dividend
   if (den.lo == 0 && den.hi == 0) {
      r128Copy(dst, negate ? &R128_min : &R128_max);
      return;
   }

   if (r128IsNeg(&den)) {
      r128__neg(&den, &den);
      negate = !negate;
   }

   // whole-number quotient of the magnitudes, with no fractional part
   quot.lo = 0;
   quot.hi = r128__umod(&num, &den);

   // give the quotient the sign of a / b
   if (negate) {
      quot.hi = ~quot.hi + 1;
   }

   // remainder = a - quotient * b, using the original signed operands
   r128Mul(&quot, &quot, b);
   r128Sub(dst, a, &quot);
}
1985

1986
// dst = 1 / sqrt(v), by Newton-Raphson iteration.
// Negative input yields R128_min; zero input yields zero. The starting guess
// is a power of two chosen from the position of v's leading set bit, and at
// most seven iterations of
//    guess = guess * (3/2 - (v / 2) * guess * guess)
// are run, stopping early once the guess no longer changes.
void r128Rsqrt(R128 *dst, const R128 *v)
{
   static const R128 threeHalves = { R128_LIT_U64(0x8000000000000000), 1 };
   R128 halfX, guess;
   int iter;

   if ((R128_S64)v->hi < 0) {
      r128Copy(dst, &R128_min);
      return;
   }

   R128_SET2(&halfX, v->lo, v->hi);

   if (halfX.hi == 0 && halfX.lo == 0) {
      R128_SET2(dst, 0, 0);
      return;
   }

   // get initial estimate
   if (halfX.hi) {
      int shift = (64 + r128__clz64(halfX.hi)) >> 1;
      guess.lo = R128_LIT_U64(1) << shift;
      guess.hi = 0;
   } else {
      int shift = r128__clz64(halfX.lo) >> 1;
      guess.hi = R128_LIT_U64(1) << shift;
      guess.lo = 0;
   }

   // the iteration only ever needs v / 2
   r128Shr(&halfX, &halfX, 1);

   for (iter = 0; iter < 7; ++iter) {
      R128 next;

      r128__umul(&next, &guess, &guess);
      r128__umul(&next, &next, &halfX);
      r128Sub(&next, &threeHalves, &next);
      r128__umul(&next, &guess, &next);

      // converged
      if (next.lo == guess.lo && next.hi == guess.hi) {
         break;
      }
      R128_SET2(&guess, next.lo, next.hi);
   }

   r128Copy(dst, &guess);
}
2034

2035
// dst = sqrt(v), by Newton-Raphson iteration.
// Negative input yields R128_min; zero input yields zero. The starting guess
// is v shifted by an amount chosen from the position of its leading set bit,
// and at most seven iterations of
//    guess = (guess + v / guess) / 2
// are run, stopping early once the guess no longer changes.
void r128Sqrt(R128 *dst, const R128 *v)
{
   R128 x, guess;
   int iter;

   if ((R128_S64)v->hi < 0) {
      r128Copy(dst, &R128_min);
      return;
   }

   R128_SET2(&x, v->lo, v->hi);

   if (x.hi == 0 && x.lo == 0) {
      R128_SET2(dst, 0, 0);
      return;
   }

   // get initial estimate
   if (x.hi) {
      int shift = (63 - r128__clz64(x.hi)) >> 1;
      r128Shr(&guess, &x, shift);
   } else {
      int shift = (1 + r128__clz64(x.lo)) >> 1;
      r128Shl(&guess, &x, shift);
   }

   for (iter = 0; iter < 7; ++iter) {
      R128 next;

      r128__udiv(&next, &x, &guess);
      r128Add(&next, &next, &guess);
      r128Shr(&next, &next, 1);

      // converged
      if (next.lo == guess.lo && next.hi == guess.hi) {
         break;
      }
      R128_SET2(&guess, next.lo, next.hi);
   }

   r128Copy(dst, &guess);
}
2076

2077
int r128Cmp(const R128 *a, const R128 *b)
2078
{
2079
   R128_ASSERT(a != NULL);
2080
   R128_ASSERT(b != NULL);
2081

2082
   if (a->hi == b->hi) {
2083
      if (a->lo == b->lo) {
2084
         return 0;
2085
      } else if (a->lo > b->lo) {
2086
         return 1;
2087
      } else {
2088
         return -1;
2089
      }
2090
   } else if ((R128_S64)a->hi > (R128_S64)b->hi) {
2091
      return 1;
2092
   } else {
2093
      return -1;
2094
   }
2095
}
2096

2097
int r128IsNeg(const R128 *v)
2098
{
2099
   R128_ASSERT(v != NULL);
2100

2101
   return (R128_S64)v->hi < 0;
2102
}
2103

2104
// dst = the smaller of a and b by signed comparison; b is chosen when equal.
void r128Min(R128 *dst, const R128 *a, const R128 *b)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   r128Copy(dst, (r128Cmp(a, b) < 0) ? a : b);
}
2116

2117
// dst = the larger of a and b by signed comparison; b is chosen when equal.
void r128Max(R128 *dst, const R128 *a, const R128 *b)
{
   R128_ASSERT(dst != NULL);
   R128_ASSERT(a != NULL);
   R128_ASSERT(b != NULL);

   r128Copy(dst, (r128Cmp(a, b) > 0) ? a : b);
}
2129

2130
// dst = v rounded down: the whole word is kept and the fractional word is
// cleared.
void r128Floor(R128 *dst, const R128 *v)
{
   R128_U64 whole;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(v != NULL);

   whole = v->hi;
   dst->lo = 0;
   dst->hi = whole;
   R128_DEBUG_SET(dst);
}
2139

2140
// dst = v rounded up: the whole word is incremented when any fraction bit is
// set, and the fractional word is cleared.
void r128Ceil(R128 *dst, const R128 *v)
{
   R128_U64 whole;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(v != NULL);

   // read v completely before writing, since dst may alias it
   whole = v->hi;
   if (v->lo != 0) {
      ++whole;
   }

   dst->hi = whole;
   dst->lo = 0;
   R128_DEBUG_SET(dst);
}
2149

2150
// dst = v rounded to the nearest whole number.
// The whole word is incremented when the fractional word reaches a threshold:
// exactly one half for non-negative v, one bit above one half for negative v.
// Ties therefore round away from zero.
void r128Round(R128* dst, const R128* v)
{
   R128_U64 threshold;
   R128_U64 whole;

   R128_ASSERT(dst != NULL);
   R128_ASSERT(v != NULL);

   threshold = R128_LIT_U64(0x8000000000000000) + (R128_U64)((R128_S64)v->hi < 0);

   // read v completely before writing, since dst may alias it
   whole = v->hi + (v->lo >= threshold);

   dst->hi = whole;
   dst->lo = 0;
   R128_DEBUG_SET(dst);
}
2159

2160
#endif   //R128_IMPLEMENTATION
2161

2162
Product

Resources

Company