Path: blob/main/contrib/bearssl/src/int/i31_moddiv.c
39483 views
/*1* Copyright (c) 2018 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/*27* In this file, we handle big integers with a custom format, i.e.28* without the usual one-word header. Value is split into 31-bit words,29* each stored in a 32-bit slot (top bit is zero) in little-endian30* order. The length (in words) is provided explicitly. In some cases,31* the value can be negative (using two's complement representation). In32* some cases, the top word is allowed to have a 32th bit.33*/3435/*36* Negate big integer conditionally. The value consists of 'len' words,37* with 31 bits in each word (the top bit of each word should be 0,38* except possibly for the last word). If 'ctl' is 1, the negation is39* computed; otherwise, if 'ctl' is 0, then the value is unchanged.40*/41static void42cond_negate(uint32_t *a, size_t len, uint32_t ctl)43{44size_t k;45uint32_t cc, xm;4647cc = ctl;48xm = -ctl >> 1;49for (k = 0; k < len; k ++) {50uint32_t aw;5152aw = a[k];53aw = (aw ^ xm) + cc;54a[k] = aw & 0x7FFFFFFF;55cc = aw >> 31;56}57}5859/*60* Finish modular reduction. Rules on input parameters:61*62* if neg = 1, then -m <= a < 063* if neg = 0, then 0 <= a < 2*m64*65* If neg = 0, then the top word of a[] may use 32 bits.66*67* Also, modulus m must be odd.68*/69static void70finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)71{72size_t k;73uint32_t cc, xm, ym;7475/*76* First pass: compare a (assumed nonnegative) with m.77* Note that if the final word uses the top extra bit, then78* subtracting m must yield a value less than 2^31, since we79* assumed that a < 2*m.80*/81cc = 0;82for (k = 0; k < len; k ++) {83uint32_t aw, mw;8485aw = a[k];86mw = m[k];87cc = (aw - mw - cc) >> 31;88}8990/*91* At this point:92* if neg = 1, then we must add m (regardless of cc)93* if neg = 0 and cc = 0, then we must subtract m94* if neg = 0 and cc = 1, then we must do nothing95*/96xm = -neg >> 1;97ym = -(neg | (1 - cc));98cc = neg;99for (k = 0; k < len; k ++) {100uint32_t aw, mw;101102aw = a[k];103mw = (m[k] ^ xm) & ym;104aw = aw - mw - cc;105a[k] = aw & 0x7FFFFFFF;106cc = aw >> 31;107}108}109110/*111* Compute:112* a <- (a*pa+b*pb)/(2^31)113* b <- (a*qa+b*qb)/(2^31)114* The division is assumed to be exact (i.e. the low word is dropped).115* If the final a is negative, then it is negated. Similarly for b.116* Returned value is the combination of two bits:117* bit 0: 1 if a had to be negated, 0 otherwise118* bit 1: 1 if b had to be negated, 0 otherwise119*120* Factors pa, pb, qa and qb must be at most 2^31 in absolute value.121* Source integers a and b must be nonnegative; top word is not allowed122* to contain an extra 32th bit.123*/124static uint32_t125co_reduce(uint32_t *a, uint32_t *b, size_t len,126int64_t pa, int64_t pb, int64_t qa, int64_t qb)127{128size_t k;129int64_t cca, ccb;130uint32_t nega, negb;131132cca = 0;133ccb = 0;134for (k = 0; k < len; k ++) {135uint32_t wa, wb;136uint64_t za, zb;137uint64_t tta, ttb;138139/*140* Since:141* |pa| <= 2^31142* |pb| <= 2^31143* 0 <= wa <= 2^31 - 1144* 0 <= wb <= 2^31 - 1145* |cca| <= 2^32 - 1146* Then:147* |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1148*149* Thus, the new value of cca is such that |cca| <= 2^32 - 1.150* The same applies to ccb.151*/152wa = a[k];153wb = b[k];154za = wa * (uint64_t)pa + wb * (uint64_t)pb + (uint64_t)cca;155zb = wa * (uint64_t)qa + wb * (uint64_t)qb + (uint64_t)ccb;156if (k > 0) {157a[k - 1] = za & 0x7FFFFFFF;158b[k - 1] = zb & 0x7FFFFFFF;159}160161/*162* For the new values of cca and ccb, we need a signed163* right-shift; since, in C, right-shifting a signed164* negative value is implementation-defined, we use a165* custom portable sign extension expression.166*/167#define M ((uint64_t)1 << 32)168tta = za >> 31;169ttb = zb >> 31;170tta = (tta ^ M) - M;171ttb = (ttb ^ M) - M;172cca = *(int64_t *)&tta;173ccb = *(int64_t *)&ttb;174#undef M175}176a[len - 1] = (uint32_t)cca;177b[len - 1] = (uint32_t)ccb;178179nega = (uint32_t)((uint64_t)cca >> 63);180negb = (uint32_t)((uint64_t)ccb >> 63);181cond_negate(a, len, nega);182cond_negate(b, len, negb);183return nega | (negb << 1);184}185186/*187* Compute:188* a <- (a*pa+b*pb)/(2^31) mod m189* b <- (a*qa+b*qb)/(2^31) mod m190*191* m0i is equal to -1/m[0] mod 2^31.192*193* Factors pa, pb, qa and qb must be at most 2^31 in absolute value.194* Source integers a and b must be nonnegative; top word is not allowed195* to contain an extra 32th bit.196*/197static void198co_reduce_mod(uint32_t *a, uint32_t *b, size_t len,199int64_t pa, int64_t pb, int64_t qa, int64_t qb,200const uint32_t *m, uint32_t m0i)201{202size_t k;203int64_t cca, ccb;204uint32_t fa, fb;205206cca = 0;207ccb = 0;208fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFFFFFF;209fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFFFFFF;210for (k = 0; k < len; k ++) {211uint32_t wa, wb;212uint64_t za, zb;213uint64_t tta, ttb;214215/*216* In this loop, carries 'cca' and 'ccb' always fit on217* 33 bits (in absolute value).218*/219wa = a[k];220wb = b[k];221za = wa * (uint64_t)pa + wb * (uint64_t)pb222+ m[k] * (uint64_t)fa + (uint64_t)cca;223zb = wa * (uint64_t)qa + wb * (uint64_t)qb224+ m[k] * (uint64_t)fb + (uint64_t)ccb;225if (k > 0) {226a[k - 1] = (uint32_t)za & 0x7FFFFFFF;227b[k - 1] = (uint32_t)zb & 0x7FFFFFFF;228}229230#define M ((uint64_t)1 << 32)231tta = za >> 31;232ttb = zb >> 31;233tta = (tta ^ M) - M;234ttb = (ttb ^ M) - M;235cca = *(int64_t *)&tta;236ccb = *(int64_t *)&ttb;237#undef M238}239a[len - 1] = (uint32_t)cca;240b[len - 1] = (uint32_t)ccb;241242/*243* At this point:244* -m <= a < 2*m245* -m <= b < 2*m246* (this is a case of Montgomery reduction)247* The top word of 'a' and 'b' may have a 32-th bit set.248* We may have to add or subtract the modulus.249*/250finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));251finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));252}253254/* see inner.h */255uint32_t256br_i31_moddiv(uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i,257uint32_t *t)258{259/*260* Algorithm is an extended binary GCD. We maintain four values261* a, b, u and v, with the following invariants:262*263* a * x = y * u mod m264* b * x = y * v mod m265*266* Starting values are:267*268* a = y269* b = m270* u = x271* v = 0272*273* The formal definition of the algorithm is a sequence of steps:274*275* - If a is even, then a <- a/2 and u <- u/2 mod m.276* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.277* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.278* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.279*280* Algorithm stops when a = b. At that point, they both are equal281* to GCD(y,m); the modular division succeeds if that value is 1.282* The result of the modular division is then u (or v: both are283* equal at that point).284*285* Each step makes either a or b shrink by at least one bit; hence,286* if m has bit length k bits, then 2k-2 steps are sufficient.287*288*289* Though complexity is quadratic in the size of m, the bit-by-bit290* processing is not very efficient. We can speed up processing by291* remarking that the decisions are taken based only on observation292* of the top and low bits of a and b.293*294* In the loop below, at each iteration, we use the two top words295* of a and b, and the low words of a and b, to compute reduction296* parameters pa, pb, qa and qb such that the new values for a297* and b are:298*299* a' = (a*pa + b*pb) / (2^31)300* b' = (a*qa + b*qb) / (2^31)301*302* the division being exact.303*304* Since the choices are based on the top words, they may be slightly305* off, requiring an optional correction: if a' < 0, then we replace306* pa with -pa, and pb with -pb. The total length of a and b is307* thus reduced by at least 30 bits at each iteration.308*309* The stopping conditions are still the same, though: when a310* and b become equal, they must be both odd (since m is odd,311* the GCD cannot be even), therefore the next operation is a312* subtraction, and one of the values becomes 0. At that point,313* nothing else happens, i.e. one value is stuck at 0, and the314* other one is the GCD.315*/316size_t len, k;317uint32_t *a, *b, *u, *v;318uint32_t num, r;319320len = (m[0] + 31) >> 5;321a = t;322b = a + len;323u = x + 1;324v = b + len;325memcpy(a, y + 1, len * sizeof *y);326memcpy(b, m + 1, len * sizeof *m);327memset(v, 0, len * sizeof *v);328329/*330* Loop below ensures that a and b are reduced by some bits each,331* for a total of at least 30 bits.332*/333for (num = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) {334size_t j;335uint32_t c0, c1;336uint32_t a0, a1, b0, b1;337uint64_t a_hi, b_hi;338uint32_t a_lo, b_lo;339int64_t pa, pb, qa, qb;340int i;341342/*343* Extract top words of a and b. If j is the highest344* index >= 1 such that a[j] != 0 or b[j] != 0, then we want345* (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].346* If a and b are down to one word each, then we use a[0]347* and b[0].348*/349c0 = (uint32_t)-1;350c1 = (uint32_t)-1;351a0 = 0;352a1 = 0;353b0 = 0;354b1 = 0;355j = len;356while (j -- > 0) {357uint32_t aw, bw;358359aw = a[j];360bw = b[j];361a0 ^= (a0 ^ aw) & c0;362a1 ^= (a1 ^ aw) & c1;363b0 ^= (b0 ^ bw) & c0;364b1 ^= (b1 ^ bw) & c1;365c1 = c0;366c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;367}368369/*370* If c1 = 0, then we grabbed two words for a and b.371* If c1 != 0 but c0 = 0, then we grabbed one word. It372* is not possible that c1 != 0 and c0 != 0, because that373* would mean that both integers are zero.374*/375a1 |= a0 & c1;376a0 &= ~c1;377b1 |= b0 & c1;378b0 &= ~c1;379a_hi = ((uint64_t)a0 << 31) + a1;380b_hi = ((uint64_t)b0 << 31) + b1;381a_lo = a[0];382b_lo = b[0];383384/*385* Compute reduction factors:386*387* a' = a*pa + b*pb388* b' = a*qa + b*qb389*390* such that a' and b' are both multiple of 2^31, but are391* only marginally larger than a and b.392*/393pa = 1;394pb = 0;395qa = 0;396qb = 1;397for (i = 0; i < 31; i ++) {398/*399* At each iteration:400*401* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi402* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi403* a <- a/2 if: a is even404* b <- b/2 if: a is odd, b is even405*406* We multiply a_lo and b_lo by 2 at each407* iteration, thus a division by 2 really is a408* non-multiplication by 2.409*/410uint32_t r, oa, ob, cAB, cBA, cA;411uint64_t rz;412413/*414* r = GT(a_hi, b_hi)415* But the GT() function works on uint32_t operands,416* so we inline a 64-bit version here.417*/418rz = b_hi - a_hi;419r = (uint32_t)((rz ^ ((a_hi ^ b_hi)420& (a_hi ^ rz))) >> 63);421422/*423* cAB = 1 if b must be subtracted from a424* cBA = 1 if a must be subtracted from b425* cA = 1 if a is divided by 2, 0 otherwise426*427* Rules:428*429* cAB and cBA cannot be both 1.430* if a is not divided by 2, b is.431*/432oa = (a_lo >> i) & 1;433ob = (b_lo >> i) & 1;434cAB = oa & ob & r;435cBA = oa & ob & NOT(r);436cA = cAB | NOT(oa);437438/*439* Conditional subtractions.440*/441a_lo -= b_lo & -cAB;442a_hi -= b_hi & -(uint64_t)cAB;443pa -= qa & -(int64_t)cAB;444pb -= qb & -(int64_t)cAB;445b_lo -= a_lo & -cBA;446b_hi -= a_hi & -(uint64_t)cBA;447qa -= pa & -(int64_t)cBA;448qb -= pb & -(int64_t)cBA;449450/*451* Shifting.452*/453a_lo += a_lo & (cA - 1);454pa += pa & ((int64_t)cA - 1);455pb += pb & ((int64_t)cA - 1);456a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;457b_lo += b_lo & -cA;458qa += qa & -(int64_t)cA;459qb += qb & -(int64_t)cA;460b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);461}462463/*464* Replace a and b with new values a' and b'.465*/466r = co_reduce(a, b, len, pa, pb, qa, qb);467pa -= pa * ((r & 1) << 1);468pb -= pb * ((r & 1) << 1);469qa -= qa * (r & 2);470qb -= qb * (r & 2);471co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);472}473474/*475* Now one of the arrays should be 0, and the other contains476* the GCD. If a is 0, then u is 0 as well, and v contains477* the division result.478* Result is correct if and only if GCD is 1.479*/480r = (a[0] | b[0]) ^ 1;481u[0] |= v[0];482for (k = 1; k < len; k ++) {483r |= a[k] | b[k];484u[k] |= v[k];485}486return EQ0(r);487}488489490