Path: blob/main/contrib/bearssl/src/int/i15_moddiv.c
39483 views
/*1* Copyright (c) 2018 Thomas Pornin <[email protected]>2*3* Permission is hereby granted, free of charge, to any person obtaining4* a copy of this software and associated documentation files (the5* "Software"), to deal in the Software without restriction, including6* without limitation the rights to use, copy, modify, merge, publish,7* distribute, sublicense, and/or sell copies of the Software, and to8* permit persons to whom the Software is furnished to do so, subject to9* the following conditions:10*11* The above copyright notice and this permission notice shall be12* included in all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,15* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF16* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND17* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS18* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN19* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN20* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE21* SOFTWARE.22*/2324#include "inner.h"2526/*27* In this file, we handle big integers with a custom format, i.e.28* without the usual one-word header. Value is split into 15-bit words,29* each stored in a 16-bit slot (top bit is zero) in little-endian30* order. The length (in words) is provided explicitly. In some cases,31* the value can be negative (using two's complement representation). In32* some cases, the top word is allowed to have a 16th bit.33*/3435/*36* Negate big integer conditionally. The value consists of 'len' words,37* with 15 bits in each word (the top bit of each word should be 0,38* except possibly for the last word). If 'ctl' is 1, the negation is39* computed; otherwise, if 'ctl' is 0, then the value is unchanged.40*/41static void42cond_negate(uint16_t *a, size_t len, uint32_t ctl)43{44size_t k;45uint32_t cc, xm;4647cc = ctl;48xm = 0x7FFF & -ctl;49for (k = 0; k < len; k ++) {50uint32_t aw;5152aw = a[k];53aw = (aw ^ xm) + cc;54a[k] = aw & 0x7FFF;55cc = (aw >> 15) & 1;56}57}5859/*60* Finish modular reduction. Rules on input parameters:61*62* if neg = 1, then -m <= a < 063* if neg = 0, then 0 <= a < 2*m64*65* If neg = 0, then the top word of a[] may use 16 bits.66*67* Also, modulus m must be odd.68*/69static void70finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)71{72size_t k;73uint32_t cc, xm, ym;7475/*76* First pass: compare a (assumed nonnegative) with m.77*/78cc = 0;79for (k = 0; k < len; k ++) {80uint32_t aw, mw;8182aw = a[k];83mw = m[k];84cc = (aw - mw - cc) >> 31;85}8687/*88* At this point:89* if neg = 1, then we must add m (regardless of cc)90* if neg = 0 and cc = 0, then we must subtract m91* if neg = 0 and cc = 1, then we must do nothing92*/93xm = 0x7FFF & -neg;94ym = -(neg | (1 - cc));95cc = neg;96for (k = 0; k < len; k ++) {97uint32_t aw, mw;9899aw = a[k];100mw = (m[k] ^ xm) & ym;101aw = aw - mw - cc;102a[k] = aw & 0x7FFF;103cc = aw >> 31;104}105}106107/*108* Compute:109* a <- (a*pa+b*pb)/(2^15)110* b <- (a*qa+b*qb)/(2^15)111* The division is assumed to be exact (i.e. the low word is dropped).112* If the final a is negative, then it is negated. Similarly for b.113* Returned value is the combination of two bits:114* bit 0: 1 if a had to be negated, 0 otherwise115* bit 1: 1 if b had to be negated, 0 otherwise116*117* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.118* Source integers a and b must be nonnegative; top word is not allowed119* to contain an extra 16th bit.120*/121static uint32_t122co_reduce(uint16_t *a, uint16_t *b, size_t len,123int32_t pa, int32_t pb, int32_t qa, int32_t qb)124{125size_t k;126int32_t cca, ccb;127uint32_t nega, negb;128129cca = 0;130ccb = 0;131for (k = 0; k < len; k ++) {132uint32_t wa, wb, za, zb;133uint16_t tta, ttb;134135/*136* Since:137* |pa| <= 2^15138* |pb| <= 2^15139* 0 <= wa <= 2^15 - 1140* 0 <= wb <= 2^15 - 1141* |cca| <= 2^16 - 1142* Then:143* |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1144*145* Thus, the new value of cca is such that |cca| <= 2^16 - 1.146* The same applies to ccb.147*/148wa = a[k];149wb = b[k];150za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;151zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;152if (k > 0) {153a[k - 1] = za & 0x7FFF;154b[k - 1] = zb & 0x7FFF;155}156tta = za >> 15;157ttb = zb >> 15;158cca = *(int16_t *)&tta;159ccb = *(int16_t *)&ttb;160}161a[len - 1] = (uint16_t)cca;162b[len - 1] = (uint16_t)ccb;163nega = (uint32_t)cca >> 31;164negb = (uint32_t)ccb >> 31;165cond_negate(a, len, nega);166cond_negate(b, len, negb);167return nega | (negb << 1);168}169170/*171* Compute:172* a <- (a*pa+b*pb)/(2^15) mod m173* b <- (a*qa+b*qb)/(2^15) mod m174*175* m0i is equal to -1/m[0] mod 2^15.176*177* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.178* Source integers a and b must be nonnegative; top word is not allowed179* to contain an extra 16th bit.180*/181static void182co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,183int32_t pa, int32_t pb, int32_t qa, int32_t qb,184const uint16_t *m, uint16_t m0i)185{186size_t k;187int32_t cca, ccb, fa, fb;188189cca = 0;190ccb = 0;191fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;192fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;193for (k = 0; k < len; k ++) {194uint32_t wa, wb, za, zb;195uint32_t tta, ttb;196197/*198* In this loop, carries 'cca' and 'ccb' always fit on199* 17 bits (in absolute value).200*/201wa = a[k];202wb = b[k];203za = wa * (uint32_t)pa + wb * (uint32_t)pb204+ m[k] * (uint32_t)fa + (uint32_t)cca;205zb = wa * (uint32_t)qa + wb * (uint32_t)qb206+ m[k] * (uint32_t)fb + (uint32_t)ccb;207if (k > 0) {208a[k - 1] = za & 0x7FFF;209b[k - 1] = zb & 0x7FFF;210}211212/*213* The XOR-and-sub construction below does an arithmetic214* right shift in a portable way (technically, right-shifting215* a negative signed value is implementation-defined in C).216*/217#define M ((uint32_t)1 << 16)218tta = za >> 15;219ttb = zb >> 15;220tta = (tta ^ M) - M;221ttb = (ttb ^ M) - M;222cca = *(int32_t *)&tta;223ccb = *(int32_t *)&ttb;224#undef M225}226a[len - 1] = (uint32_t)cca;227b[len - 1] = (uint32_t)ccb;228229/*230* At this point:231* -m <= a < 2*m232* -m <= b < 2*m233* (this is a case of Montgomery reduction)234* The top word of 'a' and 'b' may have a 16-th bit set.235* We may have to add or subtract the modulus.236*/237finish_mod(a, len, m, (uint32_t)cca >> 31);238finish_mod(b, len, m, (uint32_t)ccb >> 31);239}240241/* see inner.h */242uint32_t243br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,244uint16_t *t)245{246/*247* Algorithm is an extended binary GCD. We maintain four values248* a, b, u and v, with the following invariants:249*250* a * x = y * u mod m251* b * x = y * v mod m252*253* Starting values are:254*255* a = y256* b = m257* u = x258* v = 0259*260* The formal definition of the algorithm is a sequence of steps:261*262* - If a is even, then a <- a/2 and u <- u/2 mod m.263* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.264* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.265* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.266*267* Algorithm stops when a = b. At that point, they both are equal268* to GCD(y,m); the modular division succeeds if that value is 1.269* The result of the modular division is then u (or v: both are270* equal at that point).271*272* Each step makes either a or b shrink by at least one bit; hence,273* if m has bit length k bits, then 2k-2 steps are sufficient.274*275*276* Though complexity is quadratic in the size of m, the bit-by-bit277* processing is not very efficient. We can speed up processing by278* remarking that the decisions are taken based only on observation279* of the top and low bits of a and b.280*281* In the loop below, at each iteration, we use the two top words282* of a and b, and the low words of a and b, to compute reduction283* parameters pa, pb, qa and qb such that the new values for a284* and b are:285*286* a' = (a*pa + b*pb) / (2^15)287* b' = (a*qa + b*qb) / (2^15)288*289* the division being exact.290*291* Since the choices are based on the top words, they may be slightly292* off, requiring an optional correction: if a' < 0, then we replace293* pa with -pa, and pb with -pb. The total length of a and b is294* thus reduced by at least 14 bits at each iteration.295*296* The stopping conditions are still the same, though: when a297* and b become equal, they must be both odd (since m is odd,298* the GCD cannot be even), therefore the next operation is a299* subtraction, and one of the values becomes 0. At that point,300* nothing else happens, i.e. one value is stuck at 0, and the301* other one is the GCD.302*/303size_t len, k;304uint16_t *a, *b, *u, *v;305uint32_t num, r;306307len = (m[0] + 15) >> 4;308a = t;309b = a + len;310u = x + 1;311v = b + len;312memcpy(a, y + 1, len * sizeof *y);313memcpy(b, m + 1, len * sizeof *m);314memset(v, 0, len * sizeof *v);315316/*317* Loop below ensures that a and b are reduced by some bits each,318* for a total of at least 14 bits.319*/320for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {321size_t j;322uint32_t c0, c1;323uint32_t a0, a1, b0, b1;324uint32_t a_hi, b_hi, a_lo, b_lo;325int32_t pa, pb, qa, qb;326int i;327328/*329* Extract top words of a and b. If j is the highest330* index >= 1 such that a[j] != 0 or b[j] != 0, then we want331* (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].332* If a and b are down to one word each, then we use a[0]333* and b[0].334*/335c0 = (uint32_t)-1;336c1 = (uint32_t)-1;337a0 = 0;338a1 = 0;339b0 = 0;340b1 = 0;341j = len;342while (j -- > 0) {343uint32_t aw, bw;344345aw = a[j];346bw = b[j];347a0 ^= (a0 ^ aw) & c0;348a1 ^= (a1 ^ aw) & c1;349b0 ^= (b0 ^ bw) & c0;350b1 ^= (b1 ^ bw) & c1;351c1 = c0;352c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;353}354355/*356* If c1 = 0, then we grabbed two words for a and b.357* If c1 != 0 but c0 = 0, then we grabbed one word. It358* is not possible that c1 != 0 and c0 != 0, because that359* would mean that both integers are zero.360*/361a1 |= a0 & c1;362a0 &= ~c1;363b1 |= b0 & c1;364b0 &= ~c1;365a_hi = (a0 << 15) + a1;366b_hi = (b0 << 15) + b1;367a_lo = a[0];368b_lo = b[0];369370/*371* Compute reduction factors:372*373* a' = a*pa + b*pb374* b' = a*qa + b*qb375*376* such that a' and b' are both multiple of 2^15, but are377* only marginally larger than a and b.378*/379pa = 1;380pb = 0;381qa = 0;382qb = 1;383for (i = 0; i < 15; i ++) {384/*385* At each iteration:386*387* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi388* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi389* a <- a/2 if: a is even390* b <- b/2 if: a is odd, b is even391*392* We multiply a_lo and b_lo by 2 at each393* iteration, thus a division by 2 really is a394* non-multiplication by 2.395*/396uint32_t r, oa, ob, cAB, cBA, cA;397398/*399* cAB = 1 if b must be subtracted from a400* cBA = 1 if a must be subtracted from b401* cA = 1 if a is divided by 2, 0 otherwise402*403* Rules:404*405* cAB and cBA cannot be both 1.406* if a is not divided by 2, b is.407*/408r = GT(a_hi, b_hi);409oa = (a_lo >> i) & 1;410ob = (b_lo >> i) & 1;411cAB = oa & ob & r;412cBA = oa & ob & NOT(r);413cA = cAB | NOT(oa);414415/*416* Conditional subtractions.417*/418a_lo -= b_lo & -cAB;419a_hi -= b_hi & -cAB;420pa -= qa & -(int32_t)cAB;421pb -= qb & -(int32_t)cAB;422b_lo -= a_lo & -cBA;423b_hi -= a_hi & -cBA;424qa -= pa & -(int32_t)cBA;425qb -= pb & -(int32_t)cBA;426427/*428* Shifting.429*/430a_lo += a_lo & (cA - 1);431pa += pa & ((int32_t)cA - 1);432pb += pb & ((int32_t)cA - 1);433a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;434b_lo += b_lo & -cA;435qa += qa & -(int32_t)cA;436qb += qb & -(int32_t)cA;437b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);438}439440/*441* Replace a and b with new values a' and b'.442*/443r = co_reduce(a, b, len, pa, pb, qa, qb);444pa -= pa * ((r & 1) << 1);445pb -= pb * ((r & 1) << 1);446qa -= qa * (r & 2);447qb -= qb * (r & 2);448co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);449}450451/*452* Now one of the arrays should be 0, and the other contains453* the GCD. If a is 0, then u is 0 as well, and v contains454* the division result.455* Result is correct if and only if GCD is 1.456*/457r = (a[0] | b[0]) ^ 1;458u[0] |= v[0];459for (k = 1; k < len; k ++) {460r |= a[k] | b[k];461u[k] |= v[k];462}463return EQ0(r);464}465466467