Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/int/i31_moddiv.c
39483 views
1
/*
2
* Copyright (c) 2018 Thomas Pornin <[email protected]>
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining
5
* a copy of this software and associated documentation files (the
6
* "Software"), to deal in the Software without restriction, including
7
* without limitation the rights to use, copy, modify, merge, publish,
8
* distribute, sublicense, and/or sell copies of the Software, and to
9
* permit persons to whom the Software is furnished to do so, subject to
10
* the following conditions:
11
*
12
* The above copyright notice and this permission notice shall be
13
* included in all copies or substantial portions of the Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
25
#include "inner.h"
26
27
/*
28
* In this file, we handle big integers with a custom format, i.e.
29
* without the usual one-word header. Value is split into 31-bit words,
30
* each stored in a 32-bit slot (top bit is zero) in little-endian
31
* order. The length (in words) is provided explicitly. In some cases,
32
* the value can be negative (using two's complement representation). In
33
* some cases, the top word is allowed to have a 32th bit.
34
*/
35
36
/*
37
* Negate big integer conditionally. The value consists of 'len' words,
38
* with 31 bits in each word (the top bit of each word should be 0,
39
* except possibly for the last word). If 'ctl' is 1, the negation is
40
* computed; otherwise, if 'ctl' is 0, then the value is unchanged.
41
*/
42
static void
43
cond_negate(uint32_t *a, size_t len, uint32_t ctl)
44
{
45
size_t k;
46
uint32_t cc, xm;
47
48
cc = ctl;
49
xm = -ctl >> 1;
50
for (k = 0; k < len; k ++) {
51
uint32_t aw;
52
53
aw = a[k];
54
aw = (aw ^ xm) + cc;
55
a[k] = aw & 0x7FFFFFFF;
56
cc = aw >> 31;
57
}
58
}
59
60
/*
61
* Finish modular reduction. Rules on input parameters:
62
*
63
* if neg = 1, then -m <= a < 0
64
* if neg = 0, then 0 <= a < 2*m
65
*
66
* If neg = 0, then the top word of a[] may use 32 bits.
67
*
68
* Also, modulus m must be odd.
69
*/
70
static void
71
finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)
72
{
73
size_t k;
74
uint32_t cc, xm, ym;
75
76
/*
77
* First pass: compare a (assumed nonnegative) with m.
78
* Note that if the final word uses the top extra bit, then
79
* subtracting m must yield a value less than 2^31, since we
80
* assumed that a < 2*m.
81
*/
82
cc = 0;
83
for (k = 0; k < len; k ++) {
84
uint32_t aw, mw;
85
86
aw = a[k];
87
mw = m[k];
88
cc = (aw - mw - cc) >> 31;
89
}
90
91
/*
92
* At this point:
93
* if neg = 1, then we must add m (regardless of cc)
94
* if neg = 0 and cc = 0, then we must subtract m
95
* if neg = 0 and cc = 1, then we must do nothing
96
*/
97
xm = -neg >> 1;
98
ym = -(neg | (1 - cc));
99
cc = neg;
100
for (k = 0; k < len; k ++) {
101
uint32_t aw, mw;
102
103
aw = a[k];
104
mw = (m[k] ^ xm) & ym;
105
aw = aw - mw - cc;
106
a[k] = aw & 0x7FFFFFFF;
107
cc = aw >> 31;
108
}
109
}
110
111
/*
112
* Compute:
113
* a <- (a*pa+b*pb)/(2^31)
114
* b <- (a*qa+b*qb)/(2^31)
115
* The division is assumed to be exact (i.e. the low word is dropped).
116
* If the final a is negative, then it is negated. Similarly for b.
117
* Returned value is the combination of two bits:
118
* bit 0: 1 if a had to be negated, 0 otherwise
119
* bit 1: 1 if b had to be negated, 0 otherwise
120
*
121
* Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
122
* Source integers a and b must be nonnegative; top word is not allowed
123
* to contain an extra 32th bit.
124
*/
125
static uint32_t
126
co_reduce(uint32_t *a, uint32_t *b, size_t len,
127
int64_t pa, int64_t pb, int64_t qa, int64_t qb)
128
{
129
size_t k;
130
int64_t cca, ccb;
131
uint32_t nega, negb;
132
133
cca = 0;
134
ccb = 0;
135
for (k = 0; k < len; k ++) {
136
uint32_t wa, wb;
137
uint64_t za, zb;
138
uint64_t tta, ttb;
139
140
/*
141
* Since:
142
* |pa| <= 2^31
143
* |pb| <= 2^31
144
* 0 <= wa <= 2^31 - 1
145
* 0 <= wb <= 2^31 - 1
146
* |cca| <= 2^32 - 1
147
* Then:
148
* |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1
149
*
150
* Thus, the new value of cca is such that |cca| <= 2^32 - 1.
151
* The same applies to ccb.
152
*/
153
wa = a[k];
154
wb = b[k];
155
za = wa * (uint64_t)pa + wb * (uint64_t)pb + (uint64_t)cca;
156
zb = wa * (uint64_t)qa + wb * (uint64_t)qb + (uint64_t)ccb;
157
if (k > 0) {
158
a[k - 1] = za & 0x7FFFFFFF;
159
b[k - 1] = zb & 0x7FFFFFFF;
160
}
161
162
/*
163
* For the new values of cca and ccb, we need a signed
164
* right-shift; since, in C, right-shifting a signed
165
* negative value is implementation-defined, we use a
166
* custom portable sign extension expression.
167
*/
168
#define M ((uint64_t)1 << 32)
169
tta = za >> 31;
170
ttb = zb >> 31;
171
tta = (tta ^ M) - M;
172
ttb = (ttb ^ M) - M;
173
cca = *(int64_t *)&tta;
174
ccb = *(int64_t *)&ttb;
175
#undef M
176
}
177
a[len - 1] = (uint32_t)cca;
178
b[len - 1] = (uint32_t)ccb;
179
180
nega = (uint32_t)((uint64_t)cca >> 63);
181
negb = (uint32_t)((uint64_t)ccb >> 63);
182
cond_negate(a, len, nega);
183
cond_negate(b, len, negb);
184
return nega | (negb << 1);
185
}
186
187
/*
188
* Compute:
189
* a <- (a*pa+b*pb)/(2^31) mod m
190
* b <- (a*qa+b*qb)/(2^31) mod m
191
*
192
* m0i is equal to -1/m[0] mod 2^31.
193
*
194
* Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
195
* Source integers a and b must be nonnegative; top word is not allowed
196
* to contain an extra 32th bit.
197
*/
198
static void
199
co_reduce_mod(uint32_t *a, uint32_t *b, size_t len,
200
int64_t pa, int64_t pb, int64_t qa, int64_t qb,
201
const uint32_t *m, uint32_t m0i)
202
{
203
size_t k;
204
int64_t cca, ccb;
205
uint32_t fa, fb;
206
207
cca = 0;
208
ccb = 0;
209
fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFFFFFF;
210
fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFFFFFF;
211
for (k = 0; k < len; k ++) {
212
uint32_t wa, wb;
213
uint64_t za, zb;
214
uint64_t tta, ttb;
215
216
/*
217
* In this loop, carries 'cca' and 'ccb' always fit on
218
* 33 bits (in absolute value).
219
*/
220
wa = a[k];
221
wb = b[k];
222
za = wa * (uint64_t)pa + wb * (uint64_t)pb
223
+ m[k] * (uint64_t)fa + (uint64_t)cca;
224
zb = wa * (uint64_t)qa + wb * (uint64_t)qb
225
+ m[k] * (uint64_t)fb + (uint64_t)ccb;
226
if (k > 0) {
227
a[k - 1] = (uint32_t)za & 0x7FFFFFFF;
228
b[k - 1] = (uint32_t)zb & 0x7FFFFFFF;
229
}
230
231
#define M ((uint64_t)1 << 32)
232
tta = za >> 31;
233
ttb = zb >> 31;
234
tta = (tta ^ M) - M;
235
ttb = (ttb ^ M) - M;
236
cca = *(int64_t *)&tta;
237
ccb = *(int64_t *)&ttb;
238
#undef M
239
}
240
a[len - 1] = (uint32_t)cca;
241
b[len - 1] = (uint32_t)ccb;
242
243
/*
244
* At this point:
245
* -m <= a < 2*m
246
* -m <= b < 2*m
247
* (this is a case of Montgomery reduction)
248
* The top word of 'a' and 'b' may have a 32-th bit set.
249
* We may have to add or subtract the modulus.
250
*/
251
finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
252
finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
253
}
254
255
/* see inner.h */
256
uint32_t
257
br_i31_moddiv(uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i,
258
uint32_t *t)
259
{
260
/*
261
* Algorithm is an extended binary GCD. We maintain four values
262
* a, b, u and v, with the following invariants:
263
*
264
* a * x = y * u mod m
265
* b * x = y * v mod m
266
*
267
* Starting values are:
268
*
269
* a = y
270
* b = m
271
* u = x
272
* v = 0
273
*
274
* The formal definition of the algorithm is a sequence of steps:
275
*
276
* - If a is even, then a <- a/2 and u <- u/2 mod m.
277
* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
278
* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
279
* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
280
*
281
* Algorithm stops when a = b. At that point, they both are equal
282
* to GCD(y,m); the modular division succeeds if that value is 1.
283
* The result of the modular division is then u (or v: both are
284
* equal at that point).
285
*
286
* Each step makes either a or b shrink by at least one bit; hence,
287
* if m has bit length k bits, then 2k-2 steps are sufficient.
288
*
289
*
290
* Though complexity is quadratic in the size of m, the bit-by-bit
291
* processing is not very efficient. We can speed up processing by
292
* remarking that the decisions are taken based only on observation
293
* of the top and low bits of a and b.
294
*
295
* In the loop below, at each iteration, we use the two top words
296
* of a and b, and the low words of a and b, to compute reduction
297
* parameters pa, pb, qa and qb such that the new values for a
298
* and b are:
299
*
300
* a' = (a*pa + b*pb) / (2^31)
301
* b' = (a*qa + b*qb) / (2^31)
302
*
303
* the division being exact.
304
*
305
* Since the choices are based on the top words, they may be slightly
306
* off, requiring an optional correction: if a' < 0, then we replace
307
* pa with -pa, and pb with -pb. The total length of a and b is
308
* thus reduced by at least 30 bits at each iteration.
309
*
310
* The stopping conditions are still the same, though: when a
311
* and b become equal, they must be both odd (since m is odd,
312
* the GCD cannot be even), therefore the next operation is a
313
* subtraction, and one of the values becomes 0. At that point,
314
* nothing else happens, i.e. one value is stuck at 0, and the
315
* other one is the GCD.
316
*/
317
size_t len, k;
318
uint32_t *a, *b, *u, *v;
319
uint32_t num, r;
320
321
len = (m[0] + 31) >> 5;
322
a = t;
323
b = a + len;
324
u = x + 1;
325
v = b + len;
326
memcpy(a, y + 1, len * sizeof *y);
327
memcpy(b, m + 1, len * sizeof *m);
328
memset(v, 0, len * sizeof *v);
329
330
/*
331
* Loop below ensures that a and b are reduced by some bits each,
332
* for a total of at least 30 bits.
333
*/
334
for (num = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) {
335
size_t j;
336
uint32_t c0, c1;
337
uint32_t a0, a1, b0, b1;
338
uint64_t a_hi, b_hi;
339
uint32_t a_lo, b_lo;
340
int64_t pa, pb, qa, qb;
341
int i;
342
343
/*
344
* Extract top words of a and b. If j is the highest
345
* index >= 1 such that a[j] != 0 or b[j] != 0, then we want
346
* (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].
347
* If a and b are down to one word each, then we use a[0]
348
* and b[0].
349
*/
350
c0 = (uint32_t)-1;
351
c1 = (uint32_t)-1;
352
a0 = 0;
353
a1 = 0;
354
b0 = 0;
355
b1 = 0;
356
j = len;
357
while (j -- > 0) {
358
uint32_t aw, bw;
359
360
aw = a[j];
361
bw = b[j];
362
a0 ^= (a0 ^ aw) & c0;
363
a1 ^= (a1 ^ aw) & c1;
364
b0 ^= (b0 ^ bw) & c0;
365
b1 ^= (b1 ^ bw) & c1;
366
c1 = c0;
367
c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
368
}
369
370
/*
371
* If c1 = 0, then we grabbed two words for a and b.
372
* If c1 != 0 but c0 = 0, then we grabbed one word. It
373
* is not possible that c1 != 0 and c0 != 0, because that
374
* would mean that both integers are zero.
375
*/
376
a1 |= a0 & c1;
377
a0 &= ~c1;
378
b1 |= b0 & c1;
379
b0 &= ~c1;
380
a_hi = ((uint64_t)a0 << 31) + a1;
381
b_hi = ((uint64_t)b0 << 31) + b1;
382
a_lo = a[0];
383
b_lo = b[0];
384
385
/*
386
* Compute reduction factors:
387
*
388
* a' = a*pa + b*pb
389
* b' = a*qa + b*qb
390
*
391
* such that a' and b' are both multiple of 2^31, but are
392
* only marginally larger than a and b.
393
*/
394
pa = 1;
395
pb = 0;
396
qa = 0;
397
qb = 1;
398
for (i = 0; i < 31; i ++) {
399
/*
400
* At each iteration:
401
*
402
* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
403
* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
404
* a <- a/2 if: a is even
405
* b <- b/2 if: a is odd, b is even
406
*
407
* We multiply a_lo and b_lo by 2 at each
408
* iteration, thus a division by 2 really is a
409
* non-multiplication by 2.
410
*/
411
uint32_t r, oa, ob, cAB, cBA, cA;
412
uint64_t rz;
413
414
/*
415
* r = GT(a_hi, b_hi)
416
* But the GT() function works on uint32_t operands,
417
* so we inline a 64-bit version here.
418
*/
419
rz = b_hi - a_hi;
420
r = (uint32_t)((rz ^ ((a_hi ^ b_hi)
421
& (a_hi ^ rz))) >> 63);
422
423
/*
424
* cAB = 1 if b must be subtracted from a
425
* cBA = 1 if a must be subtracted from b
426
* cA = 1 if a is divided by 2, 0 otherwise
427
*
428
* Rules:
429
*
430
* cAB and cBA cannot be both 1.
431
* if a is not divided by 2, b is.
432
*/
433
oa = (a_lo >> i) & 1;
434
ob = (b_lo >> i) & 1;
435
cAB = oa & ob & r;
436
cBA = oa & ob & NOT(r);
437
cA = cAB | NOT(oa);
438
439
/*
440
* Conditional subtractions.
441
*/
442
a_lo -= b_lo & -cAB;
443
a_hi -= b_hi & -(uint64_t)cAB;
444
pa -= qa & -(int64_t)cAB;
445
pb -= qb & -(int64_t)cAB;
446
b_lo -= a_lo & -cBA;
447
b_hi -= a_hi & -(uint64_t)cBA;
448
qa -= pa & -(int64_t)cBA;
449
qb -= pb & -(int64_t)cBA;
450
451
/*
452
* Shifting.
453
*/
454
a_lo += a_lo & (cA - 1);
455
pa += pa & ((int64_t)cA - 1);
456
pb += pb & ((int64_t)cA - 1);
457
a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
458
b_lo += b_lo & -cA;
459
qa += qa & -(int64_t)cA;
460
qb += qb & -(int64_t)cA;
461
b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
462
}
463
464
/*
465
* Replace a and b with new values a' and b'.
466
*/
467
r = co_reduce(a, b, len, pa, pb, qa, qb);
468
pa -= pa * ((r & 1) << 1);
469
pb -= pb * ((r & 1) << 1);
470
qa -= qa * (r & 2);
471
qb -= qb * (r & 2);
472
co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
473
}
474
475
/*
476
* Now one of the arrays should be 0, and the other contains
477
* the GCD. If a is 0, then u is 0 as well, and v contains
478
* the division result.
479
* Result is correct if and only if GCD is 1.
480
*/
481
r = (a[0] | b[0]) ^ 1;
482
u[0] |= v[0];
483
for (k = 1; k < len; k ++) {
484
r |= a[k] | b[k];
485
u[k] |= v[k];
486
}
487
return EQ0(r);
488
}
489
490