Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Modules/_decimal/libmpdec/umodarith.h
12 views
1
/*
2
* Copyright (c) 2008-2020 Stefan Krah. All rights reserved.
3
*
4
* Redistribution and use in source and binary forms, with or without
5
* modification, are permitted provided that the following conditions
6
* are met:
7
*
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
*
11
* 2. Redistributions in binary form must reproduce the above copyright
12
* notice, this list of conditions and the following disclaimer in the
13
* documentation and/or other materials provided with the distribution.
14
*
15
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
16
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25
* SUCH DAMAGE.
26
*/
27
28
29
#ifndef LIBMPDEC_UMODARITH_H_
30
#define LIBMPDEC_UMODARITH_H_
31
32
33
#include "mpdecimal.h"
34
35
#include "constants.h"
36
#include "typearith.h"
37
38
39
/* Bignum: Low level routines for unsigned modular arithmetic. These are
40
used in the fast convolution functions for very large coefficients. */
41
42
43
/**************************************************************************/
44
/* ANSI modular arithmetic */
45
/**************************************************************************/
46
47
48
/*
49
* Restrictions: a < m and b < m
50
* ACL2 proof: umodarith.lisp: addmod-correct
51
*/
52
static inline mpd_uint_t
53
addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
54
{
55
mpd_uint_t s;
56
57
s = a + b;
58
s = (s < a) ? s - m : s;
59
s = (s >= m) ? s - m : s;
60
61
return s;
62
}
63
64
/*
65
* Restrictions: a < m and b < m
66
* ACL2 proof: umodarith.lisp: submod-2-correct
67
*/
68
static inline mpd_uint_t
69
submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
70
{
71
mpd_uint_t d;
72
73
d = a - b;
74
d = (a < b) ? d + m : d;
75
76
return d;
77
}
78
79
/*
80
* Restrictions: a < 2m and b < 2m
81
* ACL2 proof: umodarith.lisp: section ext-submod
82
*/
83
static inline mpd_uint_t
84
ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
85
{
86
mpd_uint_t d;
87
88
a = (a >= m) ? a - m : a;
89
b = (b >= m) ? b - m : b;
90
91
d = a - b;
92
d = (a < b) ? d + m : d;
93
94
return d;
95
}
96
97
/*
98
* Reduce double word modulo m.
99
* Restrictions: m != 0
100
* ACL2 proof: umodarith.lisp: section dw-reduce
101
*/
102
static inline mpd_uint_t
103
dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
104
{
105
mpd_uint_t r1, r2, w;
106
107
_mpd_div_word(&w, &r1, hi, m);
108
_mpd_div_words(&w, &r2, r1, lo, m);
109
110
return r2;
111
}
112
113
/*
114
* Subtract double word from a.
115
* Restrictions: a < m
116
* ACL2 proof: umodarith.lisp: section dw-submod
117
*/
118
static inline mpd_uint_t
119
dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
120
{
121
mpd_uint_t d, r;
122
123
r = dw_reduce(hi, lo, m);
124
d = a - r;
125
d = (a < r) ? d + m : d;
126
127
return d;
128
}
129
130
#ifdef CONFIG_64
131
132
/**************************************************************************/
133
/* 64-bit modular arithmetic */
134
/**************************************************************************/
135
136
/*
137
* A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
138
* proof is in umodarith.lisp: section "Fast modular reduction".
139
*
140
* Algorithm: calculate (a * b) % p:
141
*
142
* a) hi, lo <- a * b # Calculate a * b.
143
*
144
* b) hi, lo <- R(hi, lo) # Reduce modulo p.
145
*
146
* c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
147
*
148
* d) If the result is less than p, return lo. Otherwise return lo - p.
149
*/
150
151
static inline mpd_uint_t
152
x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
153
{
154
mpd_uint_t hi, lo, x, y;
155
156
157
_mpd_mul_words(&hi, &lo, a, b);
158
159
if (m & (1ULL<<32)) { /* P1 */
160
161
/* first reduction */
162
x = y = hi;
163
hi >>= 32;
164
165
x = lo - x;
166
if (x > lo) hi--;
167
168
y <<= 32;
169
lo = y + x;
170
if (lo < y) hi++;
171
172
/* second reduction */
173
x = y = hi;
174
hi >>= 32;
175
176
x = lo - x;
177
if (x > lo) hi--;
178
179
y <<= 32;
180
lo = y + x;
181
if (lo < y) hi++;
182
183
return (hi || lo >= m ? lo - m : lo);
184
}
185
else if (m & (1ULL<<34)) { /* P2 */
186
187
/* first reduction */
188
x = y = hi;
189
hi >>= 30;
190
191
x = lo - x;
192
if (x > lo) hi--;
193
194
y <<= 34;
195
lo = y + x;
196
if (lo < y) hi++;
197
198
/* second reduction */
199
x = y = hi;
200
hi >>= 30;
201
202
x = lo - x;
203
if (x > lo) hi--;
204
205
y <<= 34;
206
lo = y + x;
207
if (lo < y) hi++;
208
209
/* third reduction */
210
x = y = hi;
211
hi >>= 30;
212
213
x = lo - x;
214
if (x > lo) hi--;
215
216
y <<= 34;
217
lo = y + x;
218
if (lo < y) hi++;
219
220
return (hi || lo >= m ? lo - m : lo);
221
}
222
else { /* P3 */
223
224
/* first reduction */
225
x = y = hi;
226
hi >>= 24;
227
228
x = lo - x;
229
if (x > lo) hi--;
230
231
y <<= 40;
232
lo = y + x;
233
if (lo < y) hi++;
234
235
/* second reduction */
236
x = y = hi;
237
hi >>= 24;
238
239
x = lo - x;
240
if (x > lo) hi--;
241
242
y <<= 40;
243
lo = y + x;
244
if (lo < y) hi++;
245
246
/* third reduction */
247
x = y = hi;
248
hi >>= 24;
249
250
x = lo - x;
251
if (x > lo) hi--;
252
253
y <<= 40;
254
lo = y + x;
255
if (lo < y) hi++;
256
257
return (hi || lo >= m ? lo - m : lo);
258
}
259
}
260
261
static inline void
262
x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
263
{
264
*a = x64_mulmod(*a, w, m);
265
*b = x64_mulmod(*b, w, m);
266
}
267
268
static inline void
269
x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
270
mpd_uint_t m)
271
{
272
*a0 = x64_mulmod(*a0, b0, m);
273
*a1 = x64_mulmod(*a1, b1, m);
274
}
275
276
static inline mpd_uint_t
277
x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
278
{
279
mpd_uint_t r = 1;
280
281
while (exp > 0) {
282
if (exp & 1)
283
r = x64_mulmod(r, base, umod);
284
base = x64_mulmod(base, base, umod);
285
exp >>= 1;
286
}
287
288
return r;
289
}
290
291
/* END CONFIG_64 */
292
#else /* CONFIG_32 */
293
294
295
/**************************************************************************/
296
/* 32-bit modular arithmetic */
297
/**************************************************************************/
298
299
#if defined(ANSI)
300
#if !defined(LEGACY_COMPILER)
301
/* HAVE_UINT64_T */
302
static inline mpd_uint_t
303
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
304
{
305
return ((mpd_uuint_t) a * b) % m;
306
}
307
308
static inline void
309
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
310
{
311
*a = ((mpd_uuint_t) *a * w) % m;
312
*b = ((mpd_uuint_t) *b * w) % m;
313
}
314
315
static inline void
316
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
317
mpd_uint_t m)
318
{
319
*a0 = ((mpd_uuint_t) *a0 * b0) % m;
320
*a1 = ((mpd_uuint_t) *a1 * b1) % m;
321
}
322
/* END HAVE_UINT64_T */
323
#else
324
/* LEGACY_COMPILER */
325
static inline mpd_uint_t
326
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
327
{
328
mpd_uint_t hi, lo, q, r;
329
_mpd_mul_words(&hi, &lo, a, b);
330
_mpd_div_words(&q, &r, hi, lo, m);
331
return r;
332
}
333
334
static inline void
335
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
336
{
337
*a = std_mulmod(*a, w, m);
338
*b = std_mulmod(*b, w, m);
339
}
340
341
static inline void
342
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
343
mpd_uint_t m)
344
{
345
*a0 = std_mulmod(*a0, b0, m);
346
*a1 = std_mulmod(*a1, b1, m);
347
}
348
/* END LEGACY_COMPILER */
349
#endif
350
351
static inline mpd_uint_t
352
std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
353
{
354
mpd_uint_t r = 1;
355
356
while (exp > 0) {
357
if (exp & 1)
358
r = std_mulmod(r, base, umod);
359
base = std_mulmod(base, base, umod);
360
exp >>= 1;
361
}
362
363
return r;
364
}
365
#endif /* ANSI CONFIG_32 */
366
367
368
/**************************************************************************/
369
/* Pentium Pro modular arithmetic */
370
/**************************************************************************/
371
372
/*
373
* A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
374
* control word must be set to 64-bit precision and truncation mode
375
* prior to using these functions.
376
*
377
* Algorithm: calculate (a * b) % p:
378
*
379
* p := prime < 2**31
380
* pinv := (long double)1.0 / p (precalculated)
381
*
382
* a) n = a * b # Calculate exact product.
383
* b) qest = n * pinv # Calculate estimate for q = n / p.
384
* c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
385
* d) r = n - q * p # Calculate remainder.
386
*
387
* Remarks:
388
*
389
* - p = dmod and pinv = dinvmod.
390
* - dinvmod points to an array of three uint32_t, which is interpreted
391
* as an 80 bit long double by fldt.
392
* - Intel compilers prior to version 11 do not seem to handle the
393
* __GNUC__ inline assembly correctly.
394
* - random tests are provided in tests/extended/ppro_mulmod.c
395
*/
396
397
#if defined(PPRO)
398
#if defined(ASM)
399
400
/* Return (a * b) % dmod */
401
static inline mpd_uint_t
402
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
403
{
404
mpd_uint_t retval;
405
406
__asm__ (
407
"fildl %2\n\t"
408
"fildl %1\n\t"
409
"fmulp %%st, %%st(1)\n\t"
410
"fldt (%4)\n\t"
411
"fmul %%st(1), %%st\n\t"
412
"flds %5\n\t"
413
"fadd %%st, %%st(1)\n\t"
414
"fsubrp %%st, %%st(1)\n\t"
415
"fldl (%3)\n\t"
416
"fmulp %%st, %%st(1)\n\t"
417
"fsubrp %%st, %%st(1)\n\t"
418
"fistpl %0\n\t"
419
: "=m" (retval)
420
: "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
421
: "st", "memory"
422
);
423
424
return retval;
425
}
426
427
/*
428
* Two modular multiplications in parallel:
429
* *a0 = (*a0 * w) % dmod
430
* *a1 = (*a1 * w) % dmod
431
*/
432
static inline void
433
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
434
double *dmod, uint32_t *dinvmod)
435
{
436
__asm__ (
437
"fildl %2\n\t"
438
"fildl (%1)\n\t"
439
"fmul %%st(1), %%st\n\t"
440
"fxch %%st(1)\n\t"
441
"fildl (%0)\n\t"
442
"fmulp %%st, %%st(1) \n\t"
443
"fldt (%4)\n\t"
444
"flds %5\n\t"
445
"fld %%st(2)\n\t"
446
"fmul %%st(2)\n\t"
447
"fadd %%st(1)\n\t"
448
"fsub %%st(1)\n\t"
449
"fmull (%3)\n\t"
450
"fsubrp %%st, %%st(3)\n\t"
451
"fxch %%st(2)\n\t"
452
"fistpl (%0)\n\t"
453
"fmul %%st(2)\n\t"
454
"fadd %%st(1)\n\t"
455
"fsubp %%st, %%st(1)\n\t"
456
"fmull (%3)\n\t"
457
"fsubrp %%st, %%st(1)\n\t"
458
"fistpl (%1)\n\t"
459
: : "r" (a0), "r" (a1), "m" (w),
460
"r" (dmod), "r" (dinvmod),
461
"m" (MPD_TWO63)
462
: "st", "memory"
463
);
464
}
465
466
/*
467
* Two modular multiplications in parallel:
468
* *a0 = (*a0 * b0) % dmod
469
* *a1 = (*a1 * b1) % dmod
470
*/
471
static inline void
472
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
473
double *dmod, uint32_t *dinvmod)
474
{
475
__asm__ (
476
"fildl %3\n\t"
477
"fildl (%2)\n\t"
478
"fmulp %%st, %%st(1)\n\t"
479
"fildl %1\n\t"
480
"fildl (%0)\n\t"
481
"fmulp %%st, %%st(1)\n\t"
482
"fldt (%5)\n\t"
483
"fld %%st(2)\n\t"
484
"fmul %%st(1), %%st\n\t"
485
"fxch %%st(1)\n\t"
486
"fmul %%st(2), %%st\n\t"
487
"flds %6\n\t"
488
"fldl (%4)\n\t"
489
"fxch %%st(3)\n\t"
490
"fadd %%st(1), %%st\n\t"
491
"fxch %%st(2)\n\t"
492
"fadd %%st(1), %%st\n\t"
493
"fxch %%st(2)\n\t"
494
"fsub %%st(1), %%st\n\t"
495
"fxch %%st(2)\n\t"
496
"fsubp %%st, %%st(1)\n\t"
497
"fxch %%st(1)\n\t"
498
"fmul %%st(2), %%st\n\t"
499
"fxch %%st(1)\n\t"
500
"fmulp %%st, %%st(2)\n\t"
501
"fsubrp %%st, %%st(3)\n\t"
502
"fsubrp %%st, %%st(1)\n\t"
503
"fxch %%st(1)\n\t"
504
"fistpl (%2)\n\t"
505
"fistpl (%0)\n\t"
506
: : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
507
"r" (dmod), "r" (dinvmod),
508
"m" (MPD_TWO63)
509
: "st", "memory"
510
);
511
}
512
/* END PPRO GCC ASM */
513
#elif defined(MASM)
514
515
/* Return (a * b) % dmod */
516
static inline mpd_uint_t __cdecl
517
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
518
{
519
mpd_uint_t retval;
520
521
__asm {
522
mov eax, dinvmod
523
mov edx, dmod
524
fild b
525
fild a
526
fmulp st(1), st
527
fld TBYTE PTR [eax]
528
fmul st, st(1)
529
fld MPD_TWO63
530
fadd st(1), st
531
fsubp st(1), st
532
fld QWORD PTR [edx]
533
fmulp st(1), st
534
fsubp st(1), st
535
fistp retval
536
}
537
538
return retval;
539
}
540
541
/*
542
* Two modular multiplications in parallel:
543
* *a0 = (*a0 * w) % dmod
544
* *a1 = (*a1 * w) % dmod
545
*/
546
static inline mpd_uint_t __cdecl
547
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
548
double *dmod, uint32_t *dinvmod)
549
{
550
__asm {
551
mov ecx, dmod
552
mov edx, a1
553
mov ebx, dinvmod
554
mov eax, a0
555
fild w
556
fild DWORD PTR [edx]
557
fmul st, st(1)
558
fxch st(1)
559
fild DWORD PTR [eax]
560
fmulp st(1), st
561
fld TBYTE PTR [ebx]
562
fld MPD_TWO63
563
fld st(2)
564
fmul st, st(2)
565
fadd st, st(1)
566
fsub st, st(1)
567
fmul QWORD PTR [ecx]
568
fsubp st(3), st
569
fxch st(2)
570
fistp DWORD PTR [eax]
571
fmul st, st(2)
572
fadd st, st(1)
573
fsubrp st(1), st
574
fmul QWORD PTR [ecx]
575
fsubp st(1), st
576
fistp DWORD PTR [edx]
577
}
578
}
579
580
/*
581
* Two modular multiplications in parallel:
582
* *a0 = (*a0 * b0) % dmod
583
* *a1 = (*a1 * b1) % dmod
584
*/
585
static inline void __cdecl
586
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
587
double *dmod, uint32_t *dinvmod)
588
{
589
__asm {
590
mov ecx, dmod
591
mov edx, a1
592
mov ebx, dinvmod
593
mov eax, a0
594
fild b1
595
fild DWORD PTR [edx]
596
fmulp st(1), st
597
fild b0
598
fild DWORD PTR [eax]
599
fmulp st(1), st
600
fld TBYTE PTR [ebx]
601
fld st(2)
602
fmul st, st(1)
603
fxch st(1)
604
fmul st, st(2)
605
fld DWORD PTR MPD_TWO63
606
fld QWORD PTR [ecx]
607
fxch st(3)
608
fadd st, st(1)
609
fxch st(2)
610
fadd st, st(1)
611
fxch st(2)
612
fsub st, st(1)
613
fxch st(2)
614
fsubrp st(1), st
615
fxch st(1)
616
fmul st, st(2)
617
fxch st(1)
618
fmulp st(2), st
619
fsubp st(3), st
620
fsubp st(1), st
621
fxch st(1)
622
fistp DWORD PTR [edx]
623
fistp DWORD PTR [eax]
624
}
625
}
626
#endif /* PPRO MASM (_MSC_VER) */
627
628
629
/* Return (base ** exp) % dmod */
630
static inline mpd_uint_t
631
ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
632
{
633
mpd_uint_t r = 1;
634
635
while (exp > 0) {
636
if (exp & 1)
637
r = ppro_mulmod(r, base, dmod, dinvmod);
638
base = ppro_mulmod(base, base, dmod, dinvmod);
639
exp >>= 1;
640
}
641
642
return r;
643
}
644
#endif /* PPRO */
645
#endif /* CONFIG_32 */
646
647
648
#endif /* LIBMPDEC_UMODARITH_H_ */
649
650