Path: blob/main/Modules/_decimal/libmpdec/umodarith.h
12 views
/*1* Copyright (c) 2008-2020 Stefan Krah. All rights reserved.2*3* Redistribution and use in source and binary forms, with or without4* modification, are permitted provided that the following conditions5* are met:6*7* 1. Redistributions of source code must retain the above copyright8* notice, this list of conditions and the following disclaimer.9*10* 2. Redistributions in binary form must reproduce the above copyright11* notice, this list of conditions and the following disclaimer in the12* documentation and/or other materials provided with the distribution.13*14* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND15* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE16* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE17* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE18* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL19* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS20* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)21* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT22* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY23* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF24* SUCH DAMAGE.25*/262728#ifndef LIBMPDEC_UMODARITH_H_29#define LIBMPDEC_UMODARITH_H_303132#include "mpdecimal.h"3334#include "constants.h"35#include "typearith.h"363738/* Bignum: Low level routines for unsigned modular arithmetic. These are39used in the fast convolution functions for very large coefficients. */404142/**************************************************************************/43/* ANSI modular arithmetic */44/**************************************************************************/454647/*48* Restrictions: a < m and b < m49* ACL2 proof: umodarith.lisp: addmod-correct50*/51static inline mpd_uint_t52addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)53{54mpd_uint_t s;5556s = a + b;57s = (s < a) ? s - m : s;58s = (s >= m) ? s - m : s;5960return s;61}6263/*64* Restrictions: a < m and b < m65* ACL2 proof: umodarith.lisp: submod-2-correct66*/67static inline mpd_uint_t68submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)69{70mpd_uint_t d;7172d = a - b;73d = (a < b) ? d + m : d;7475return d;76}7778/*79* Restrictions: a < 2m and b < 2m80* ACL2 proof: umodarith.lisp: section ext-submod81*/82static inline mpd_uint_t83ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)84{85mpd_uint_t d;8687a = (a >= m) ? a - m : a;88b = (b >= m) ? b - m : b;8990d = a - b;91d = (a < b) ? d + m : d;9293return d;94}9596/*97* Reduce double word modulo m.98* Restrictions: m != 099* ACL2 proof: umodarith.lisp: section dw-reduce100*/101static inline mpd_uint_t102dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)103{104mpd_uint_t r1, r2, w;105106_mpd_div_word(&w, &r1, hi, m);107_mpd_div_words(&w, &r2, r1, lo, m);108109return r2;110}111112/*113* Subtract double word from a.114* Restrictions: a < m115* ACL2 proof: umodarith.lisp: section dw-submod116*/117static inline mpd_uint_t118dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)119{120mpd_uint_t d, r;121122r = dw_reduce(hi, lo, m);123d = a - r;124d = (a < r) ? d + m : d;125126return d;127}128129#ifdef CONFIG_64130131/**************************************************************************/132/* 64-bit modular arithmetic */133/**************************************************************************/134135/*136* A proof of the algorithm is in literature/mulmod-64.txt. An ACL2137* proof is in umodarith.lisp: section "Fast modular reduction".138*139* Algorithm: calculate (a * b) % p:140*141* a) hi, lo <- a * b # Calculate a * b.142*143* b) hi, lo <- R(hi, lo) # Reduce modulo p.144*145* c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.146*147* d) If the result is less than p, return lo. Otherwise return lo - p.148*/149150static inline mpd_uint_t151x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)152{153mpd_uint_t hi, lo, x, y;154155156_mpd_mul_words(&hi, &lo, a, b);157158if (m & (1ULL<<32)) { /* P1 */159160/* first reduction */161x = y = hi;162hi >>= 32;163164x = lo - x;165if (x > lo) hi--;166167y <<= 32;168lo = y + x;169if (lo < y) hi++;170171/* second reduction */172x = y = hi;173hi >>= 32;174175x = lo - x;176if (x > lo) hi--;177178y <<= 32;179lo = y + x;180if (lo < y) hi++;181182return (hi || lo >= m ? lo - m : lo);183}184else if (m & (1ULL<<34)) { /* P2 */185186/* first reduction */187x = y = hi;188hi >>= 30;189190x = lo - x;191if (x > lo) hi--;192193y <<= 34;194lo = y + x;195if (lo < y) hi++;196197/* second reduction */198x = y = hi;199hi >>= 30;200201x = lo - x;202if (x > lo) hi--;203204y <<= 34;205lo = y + x;206if (lo < y) hi++;207208/* third reduction */209x = y = hi;210hi >>= 30;211212x = lo - x;213if (x > lo) hi--;214215y <<= 34;216lo = y + x;217if (lo < y) hi++;218219return (hi || lo >= m ? lo - m : lo);220}221else { /* P3 */222223/* first reduction */224x = y = hi;225hi >>= 24;226227x = lo - x;228if (x > lo) hi--;229230y <<= 40;231lo = y + x;232if (lo < y) hi++;233234/* second reduction */235x = y = hi;236hi >>= 24;237238x = lo - x;239if (x > lo) hi--;240241y <<= 40;242lo = y + x;243if (lo < y) hi++;244245/* third reduction */246x = y = hi;247hi >>= 24;248249x = lo - x;250if (x > lo) hi--;251252y <<= 40;253lo = y + x;254if (lo < y) hi++;255256return (hi || lo >= m ? lo - m : lo);257}258}259260static inline void261x64_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)262{263*a = x64_mulmod(*a, w, m);264*b = x64_mulmod(*b, w, m);265}266267static inline void268x64_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,269mpd_uint_t m)270{271*a0 = x64_mulmod(*a0, b0, m);272*a1 = x64_mulmod(*a1, b1, m);273}274275static inline mpd_uint_t276x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)277{278mpd_uint_t r = 1;279280while (exp > 0) {281if (exp & 1)282r = x64_mulmod(r, base, umod);283base = x64_mulmod(base, base, umod);284exp >>= 1;285}286287return r;288}289290/* END CONFIG_64 */291#else /* CONFIG_32 */292293294/**************************************************************************/295/* 32-bit modular arithmetic */296/**************************************************************************/297298#if defined(ANSI)299#if !defined(LEGACY_COMPILER)300/* HAVE_UINT64_T */301static inline mpd_uint_t302std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)303{304return ((mpd_uuint_t) a * b) % m;305}306307static inline void308std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)309{310*a = ((mpd_uuint_t) *a * w) % m;311*b = ((mpd_uuint_t) *b * w) % m;312}313314static inline void315std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,316mpd_uint_t m)317{318*a0 = ((mpd_uuint_t) *a0 * b0) % m;319*a1 = ((mpd_uuint_t) *a1 * b1) % m;320}321/* END HAVE_UINT64_T */322#else323/* LEGACY_COMPILER */324static inline mpd_uint_t325std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)326{327mpd_uint_t hi, lo, q, r;328_mpd_mul_words(&hi, &lo, a, b);329_mpd_div_words(&q, &r, hi, lo, m);330return r;331}332333static inline void334std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)335{336*a = std_mulmod(*a, w, m);337*b = std_mulmod(*b, w, m);338}339340static inline void341std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,342mpd_uint_t m)343{344*a0 = std_mulmod(*a0, b0, m);345*a1 = std_mulmod(*a1, b1, m);346}347/* END LEGACY_COMPILER */348#endif349350static inline mpd_uint_t351std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)352{353mpd_uint_t r = 1;354355while (exp > 0) {356if (exp & 1)357r = std_mulmod(r, base, umod);358base = std_mulmod(base, base, umod);359exp >>= 1;360}361362return r;363}364#endif /* ANSI CONFIG_32 */365366367/**************************************************************************/368/* Pentium Pro modular arithmetic */369/**************************************************************************/370371/*372* A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU373* control word must be set to 64-bit precision and truncation mode374* prior to using these functions.375*376* Algorithm: calculate (a * b) % p:377*378* p := prime < 2**31379* pinv := (long double)1.0 / p (precalculated)380*381* a) n = a * b # Calculate exact product.382* b) qest = n * pinv # Calculate estimate for q = n / p.383* c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.384* d) r = n - q * p # Calculate remainder.385*386* Remarks:387*388* - p = dmod and pinv = dinvmod.389* - dinvmod points to an array of three uint32_t, which is interpreted390* as an 80 bit long double by fldt.391* - Intel compilers prior to version 11 do not seem to handle the392* __GNUC__ inline assembly correctly.393* - random tests are provided in tests/extended/ppro_mulmod.c394*/395396#if defined(PPRO)397#if defined(ASM)398399/* Return (a * b) % dmod */400static inline mpd_uint_t401ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)402{403mpd_uint_t retval;404405__asm__ (406"fildl %2\n\t"407"fildl %1\n\t"408"fmulp %%st, %%st(1)\n\t"409"fldt (%4)\n\t"410"fmul %%st(1), %%st\n\t"411"flds %5\n\t"412"fadd %%st, %%st(1)\n\t"413"fsubrp %%st, %%st(1)\n\t"414"fldl (%3)\n\t"415"fmulp %%st, %%st(1)\n\t"416"fsubrp %%st, %%st(1)\n\t"417"fistpl %0\n\t"418: "=m" (retval)419: "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)420: "st", "memory"421);422423return retval;424}425426/*427* Two modular multiplications in parallel:428* *a0 = (*a0 * w) % dmod429* *a1 = (*a1 * w) % dmod430*/431static inline void432ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,433double *dmod, uint32_t *dinvmod)434{435__asm__ (436"fildl %2\n\t"437"fildl (%1)\n\t"438"fmul %%st(1), %%st\n\t"439"fxch %%st(1)\n\t"440"fildl (%0)\n\t"441"fmulp %%st, %%st(1) \n\t"442"fldt (%4)\n\t"443"flds %5\n\t"444"fld %%st(2)\n\t"445"fmul %%st(2)\n\t"446"fadd %%st(1)\n\t"447"fsub %%st(1)\n\t"448"fmull (%3)\n\t"449"fsubrp %%st, %%st(3)\n\t"450"fxch %%st(2)\n\t"451"fistpl (%0)\n\t"452"fmul %%st(2)\n\t"453"fadd %%st(1)\n\t"454"fsubp %%st, %%st(1)\n\t"455"fmull (%3)\n\t"456"fsubrp %%st, %%st(1)\n\t"457"fistpl (%1)\n\t"458: : "r" (a0), "r" (a1), "m" (w),459"r" (dmod), "r" (dinvmod),460"m" (MPD_TWO63)461: "st", "memory"462);463}464465/*466* Two modular multiplications in parallel:467* *a0 = (*a0 * b0) % dmod468* *a1 = (*a1 * b1) % dmod469*/470static inline void471ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,472double *dmod, uint32_t *dinvmod)473{474__asm__ (475"fildl %3\n\t"476"fildl (%2)\n\t"477"fmulp %%st, %%st(1)\n\t"478"fildl %1\n\t"479"fildl (%0)\n\t"480"fmulp %%st, %%st(1)\n\t"481"fldt (%5)\n\t"482"fld %%st(2)\n\t"483"fmul %%st(1), %%st\n\t"484"fxch %%st(1)\n\t"485"fmul %%st(2), %%st\n\t"486"flds %6\n\t"487"fldl (%4)\n\t"488"fxch %%st(3)\n\t"489"fadd %%st(1), %%st\n\t"490"fxch %%st(2)\n\t"491"fadd %%st(1), %%st\n\t"492"fxch %%st(2)\n\t"493"fsub %%st(1), %%st\n\t"494"fxch %%st(2)\n\t"495"fsubp %%st, %%st(1)\n\t"496"fxch %%st(1)\n\t"497"fmul %%st(2), %%st\n\t"498"fxch %%st(1)\n\t"499"fmulp %%st, %%st(2)\n\t"500"fsubrp %%st, %%st(3)\n\t"501"fsubrp %%st, %%st(1)\n\t"502"fxch %%st(1)\n\t"503"fistpl (%2)\n\t"504"fistpl (%0)\n\t"505: : "r" (a0), "m" (b0), "r" (a1), "m" (b1),506"r" (dmod), "r" (dinvmod),507"m" (MPD_TWO63)508: "st", "memory"509);510}511/* END PPRO GCC ASM */512#elif defined(MASM)513514/* Return (a * b) % dmod */515static inline mpd_uint_t __cdecl516ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)517{518mpd_uint_t retval;519520__asm {521mov eax, dinvmod522mov edx, dmod523fild b524fild a525fmulp st(1), st526fld TBYTE PTR [eax]527fmul st, st(1)528fld MPD_TWO63529fadd st(1), st530fsubp st(1), st531fld QWORD PTR [edx]532fmulp st(1), st533fsubp st(1), st534fistp retval535}536537return retval;538}539540/*541* Two modular multiplications in parallel:542* *a0 = (*a0 * w) % dmod543* *a1 = (*a1 * w) % dmod544*/545static inline mpd_uint_t __cdecl546ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,547double *dmod, uint32_t *dinvmod)548{549__asm {550mov ecx, dmod551mov edx, a1552mov ebx, dinvmod553mov eax, a0554fild w555fild DWORD PTR [edx]556fmul st, st(1)557fxch st(1)558fild DWORD PTR [eax]559fmulp st(1), st560fld TBYTE PTR [ebx]561fld MPD_TWO63562fld st(2)563fmul st, st(2)564fadd st, st(1)565fsub st, st(1)566fmul QWORD PTR [ecx]567fsubp st(3), st568fxch st(2)569fistp DWORD PTR [eax]570fmul st, st(2)571fadd st, st(1)572fsubrp st(1), st573fmul QWORD PTR [ecx]574fsubp st(1), st575fistp DWORD PTR [edx]576}577}578579/*580* Two modular multiplications in parallel:581* *a0 = (*a0 * b0) % dmod582* *a1 = (*a1 * b1) % dmod583*/584static inline void __cdecl585ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,586double *dmod, uint32_t *dinvmod)587{588__asm {589mov ecx, dmod590mov edx, a1591mov ebx, dinvmod592mov eax, a0593fild b1594fild DWORD PTR [edx]595fmulp st(1), st596fild b0597fild DWORD PTR [eax]598fmulp st(1), st599fld TBYTE PTR [ebx]600fld st(2)601fmul st, st(1)602fxch st(1)603fmul st, st(2)604fld DWORD PTR MPD_TWO63605fld QWORD PTR [ecx]606fxch st(3)607fadd st, st(1)608fxch st(2)609fadd st, st(1)610fxch st(2)611fsub st, st(1)612fxch st(2)613fsubrp st(1), st614fxch st(1)615fmul st, st(2)616fxch st(1)617fmulp st(2), st618fsubp st(3), st619fsubp st(1), st620fxch st(1)621fistp DWORD PTR [edx]622fistp DWORD PTR [eax]623}624}625#endif /* PPRO MASM (_MSC_VER) */626627628/* Return (base ** exp) % dmod */629static inline mpd_uint_t630ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)631{632mpd_uint_t r = 1;633634while (exp > 0) {635if (exp & 1)636r = ppro_mulmod(r, base, dmod, dinvmod);637base = ppro_mulmod(base, base, dmod, dinvmod);638exp >>= 1;639}640641return r;642}643#endif /* PPRO */644#endif /* CONFIG_32 */645646647#endif /* LIBMPDEC_UMODARITH_H_ */648649650