Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/int/i15_moddiv.c
39483 views
1
/*
2
* Copyright (c) 2018 Thomas Pornin <[email protected]>
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining
5
* a copy of this software and associated documentation files (the
6
* "Software"), to deal in the Software without restriction, including
7
* without limitation the rights to use, copy, modify, merge, publish,
8
* distribute, sublicense, and/or sell copies of the Software, and to
9
* permit persons to whom the Software is furnished to do so, subject to
10
* the following conditions:
11
*
12
* The above copyright notice and this permission notice shall be
13
* included in all copies or substantial portions of the Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
25
#include "inner.h"
26
27
/*
28
* In this file, we handle big integers with a custom format, i.e.
29
* without the usual one-word header. Value is split into 15-bit words,
30
* each stored in a 16-bit slot (top bit is zero) in little-endian
31
* order. The length (in words) is provided explicitly. In some cases,
32
* the value can be negative (using two's complement representation). In
33
* some cases, the top word is allowed to have a 16th bit.
34
*/
35
36
/*
37
* Negate big integer conditionally. The value consists of 'len' words,
38
* with 15 bits in each word (the top bit of each word should be 0,
39
* except possibly for the last word). If 'ctl' is 1, the negation is
40
* computed; otherwise, if 'ctl' is 0, then the value is unchanged.
41
*/
42
static void
43
cond_negate(uint16_t *a, size_t len, uint32_t ctl)
44
{
45
size_t k;
46
uint32_t cc, xm;
47
48
cc = ctl;
49
xm = 0x7FFF & -ctl;
50
for (k = 0; k < len; k ++) {
51
uint32_t aw;
52
53
aw = a[k];
54
aw = (aw ^ xm) + cc;
55
a[k] = aw & 0x7FFF;
56
cc = (aw >> 15) & 1;
57
}
58
}
59
60
/*
61
* Finish modular reduction. Rules on input parameters:
62
*
63
* if neg = 1, then -m <= a < 0
64
* if neg = 0, then 0 <= a < 2*m
65
*
66
* If neg = 0, then the top word of a[] may use 16 bits.
67
*
68
* Also, modulus m must be odd.
69
*/
70
static void
71
finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)
72
{
73
size_t k;
74
uint32_t cc, xm, ym;
75
76
/*
77
* First pass: compare a (assumed nonnegative) with m.
78
*/
79
cc = 0;
80
for (k = 0; k < len; k ++) {
81
uint32_t aw, mw;
82
83
aw = a[k];
84
mw = m[k];
85
cc = (aw - mw - cc) >> 31;
86
}
87
88
/*
89
* At this point:
90
* if neg = 1, then we must add m (regardless of cc)
91
* if neg = 0 and cc = 0, then we must subtract m
92
* if neg = 0 and cc = 1, then we must do nothing
93
*/
94
xm = 0x7FFF & -neg;
95
ym = -(neg | (1 - cc));
96
cc = neg;
97
for (k = 0; k < len; k ++) {
98
uint32_t aw, mw;
99
100
aw = a[k];
101
mw = (m[k] ^ xm) & ym;
102
aw = aw - mw - cc;
103
a[k] = aw & 0x7FFF;
104
cc = aw >> 31;
105
}
106
}
107
108
/*
109
* Compute:
110
* a <- (a*pa+b*pb)/(2^15)
111
* b <- (a*qa+b*qb)/(2^15)
112
* The division is assumed to be exact (i.e. the low word is dropped).
113
* If the final a is negative, then it is negated. Similarly for b.
114
* Returned value is the combination of two bits:
115
* bit 0: 1 if a had to be negated, 0 otherwise
116
* bit 1: 1 if b had to be negated, 0 otherwise
117
*
118
* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
119
* Source integers a and b must be nonnegative; top word is not allowed
120
* to contain an extra 16th bit.
121
*/
122
static uint32_t
123
co_reduce(uint16_t *a, uint16_t *b, size_t len,
124
int32_t pa, int32_t pb, int32_t qa, int32_t qb)
125
{
126
size_t k;
127
int32_t cca, ccb;
128
uint32_t nega, negb;
129
130
cca = 0;
131
ccb = 0;
132
for (k = 0; k < len; k ++) {
133
uint32_t wa, wb, za, zb;
134
uint16_t tta, ttb;
135
136
/*
137
* Since:
138
* |pa| <= 2^15
139
* |pb| <= 2^15
140
* 0 <= wa <= 2^15 - 1
141
* 0 <= wb <= 2^15 - 1
142
* |cca| <= 2^16 - 1
143
* Then:
144
* |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
145
*
146
* Thus, the new value of cca is such that |cca| <= 2^16 - 1.
147
* The same applies to ccb.
148
*/
149
wa = a[k];
150
wb = b[k];
151
za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
152
zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
153
if (k > 0) {
154
a[k - 1] = za & 0x7FFF;
155
b[k - 1] = zb & 0x7FFF;
156
}
157
tta = za >> 15;
158
ttb = zb >> 15;
159
cca = *(int16_t *)&tta;
160
ccb = *(int16_t *)&ttb;
161
}
162
a[len - 1] = (uint16_t)cca;
163
b[len - 1] = (uint16_t)ccb;
164
nega = (uint32_t)cca >> 31;
165
negb = (uint32_t)ccb >> 31;
166
cond_negate(a, len, nega);
167
cond_negate(b, len, negb);
168
return nega | (negb << 1);
169
}
170
171
/*
172
* Compute:
173
* a <- (a*pa+b*pb)/(2^15) mod m
174
* b <- (a*qa+b*qb)/(2^15) mod m
175
*
176
* m0i is equal to -1/m[0] mod 2^15.
177
*
178
* Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
179
* Source integers a and b must be nonnegative; top word is not allowed
180
* to contain an extra 16th bit.
181
*/
182
static void
183
co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,
184
int32_t pa, int32_t pb, int32_t qa, int32_t qb,
185
const uint16_t *m, uint16_t m0i)
186
{
187
size_t k;
188
int32_t cca, ccb, fa, fb;
189
190
cca = 0;
191
ccb = 0;
192
fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
193
fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
194
for (k = 0; k < len; k ++) {
195
uint32_t wa, wb, za, zb;
196
uint32_t tta, ttb;
197
198
/*
199
* In this loop, carries 'cca' and 'ccb' always fit on
200
* 17 bits (in absolute value).
201
*/
202
wa = a[k];
203
wb = b[k];
204
za = wa * (uint32_t)pa + wb * (uint32_t)pb
205
+ m[k] * (uint32_t)fa + (uint32_t)cca;
206
zb = wa * (uint32_t)qa + wb * (uint32_t)qb
207
+ m[k] * (uint32_t)fb + (uint32_t)ccb;
208
if (k > 0) {
209
a[k - 1] = za & 0x7FFF;
210
b[k - 1] = zb & 0x7FFF;
211
}
212
213
/*
214
* The XOR-and-sub construction below does an arithmetic
215
* right shift in a portable way (technically, right-shifting
216
* a negative signed value is implementation-defined in C).
217
*/
218
#define M ((uint32_t)1 << 16)
219
tta = za >> 15;
220
ttb = zb >> 15;
221
tta = (tta ^ M) - M;
222
ttb = (ttb ^ M) - M;
223
cca = *(int32_t *)&tta;
224
ccb = *(int32_t *)&ttb;
225
#undef M
226
}
227
a[len - 1] = (uint32_t)cca;
228
b[len - 1] = (uint32_t)ccb;
229
230
/*
231
* At this point:
232
* -m <= a < 2*m
233
* -m <= b < 2*m
234
* (this is a case of Montgomery reduction)
235
* The top word of 'a' and 'b' may have a 16-th bit set.
236
* We may have to add or subtract the modulus.
237
*/
238
finish_mod(a, len, m, (uint32_t)cca >> 31);
239
finish_mod(b, len, m, (uint32_t)ccb >> 31);
240
}
241
242
/* see inner.h */
243
uint32_t
244
br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,
245
uint16_t *t)
246
{
247
/*
248
* Algorithm is an extended binary GCD. We maintain four values
249
* a, b, u and v, with the following invariants:
250
*
251
* a * x = y * u mod m
252
* b * x = y * v mod m
253
*
254
* Starting values are:
255
*
256
* a = y
257
* b = m
258
* u = x
259
* v = 0
260
*
261
* The formal definition of the algorithm is a sequence of steps:
262
*
263
* - If a is even, then a <- a/2 and u <- u/2 mod m.
264
* - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
265
* - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
266
* - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
267
*
268
* Algorithm stops when a = b. At that point, they both are equal
269
* to GCD(y,m); the modular division succeeds if that value is 1.
270
* The result of the modular division is then u (or v: both are
271
* equal at that point).
272
*
273
* Each step makes either a or b shrink by at least one bit; hence,
274
* if m has bit length k bits, then 2k-2 steps are sufficient.
275
*
276
*
277
* Though complexity is quadratic in the size of m, the bit-by-bit
278
* processing is not very efficient. We can speed up processing by
279
* remarking that the decisions are taken based only on observation
280
* of the top and low bits of a and b.
281
*
282
* In the loop below, at each iteration, we use the two top words
283
* of a and b, and the low words of a and b, to compute reduction
284
* parameters pa, pb, qa and qb such that the new values for a
285
* and b are:
286
*
287
* a' = (a*pa + b*pb) / (2^15)
288
* b' = (a*qa + b*qb) / (2^15)
289
*
290
* the division being exact.
291
*
292
* Since the choices are based on the top words, they may be slightly
293
* off, requiring an optional correction: if a' < 0, then we replace
294
* pa with -pa, and pb with -pb. The total length of a and b is
295
* thus reduced by at least 14 bits at each iteration.
296
*
297
* The stopping conditions are still the same, though: when a
298
* and b become equal, they must be both odd (since m is odd,
299
* the GCD cannot be even), therefore the next operation is a
300
* subtraction, and one of the values becomes 0. At that point,
301
* nothing else happens, i.e. one value is stuck at 0, and the
302
* other one is the GCD.
303
*/
304
size_t len, k;
305
uint16_t *a, *b, *u, *v;
306
uint32_t num, r;
307
308
len = (m[0] + 15) >> 4;
309
a = t;
310
b = a + len;
311
u = x + 1;
312
v = b + len;
313
memcpy(a, y + 1, len * sizeof *y);
314
memcpy(b, m + 1, len * sizeof *m);
315
memset(v, 0, len * sizeof *v);
316
317
/*
318
* Loop below ensures that a and b are reduced by some bits each,
319
* for a total of at least 14 bits.
320
*/
321
for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
322
size_t j;
323
uint32_t c0, c1;
324
uint32_t a0, a1, b0, b1;
325
uint32_t a_hi, b_hi, a_lo, b_lo;
326
int32_t pa, pb, qa, qb;
327
int i;
328
329
/*
330
* Extract top words of a and b. If j is the highest
331
* index >= 1 such that a[j] != 0 or b[j] != 0, then we want
332
* (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
333
* If a and b are down to one word each, then we use a[0]
334
* and b[0].
335
*/
336
c0 = (uint32_t)-1;
337
c1 = (uint32_t)-1;
338
a0 = 0;
339
a1 = 0;
340
b0 = 0;
341
b1 = 0;
342
j = len;
343
while (j -- > 0) {
344
uint32_t aw, bw;
345
346
aw = a[j];
347
bw = b[j];
348
a0 ^= (a0 ^ aw) & c0;
349
a1 ^= (a1 ^ aw) & c1;
350
b0 ^= (b0 ^ bw) & c0;
351
b1 ^= (b1 ^ bw) & c1;
352
c1 = c0;
353
c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;
354
}
355
356
/*
357
* If c1 = 0, then we grabbed two words for a and b.
358
* If c1 != 0 but c0 = 0, then we grabbed one word. It
359
* is not possible that c1 != 0 and c0 != 0, because that
360
* would mean that both integers are zero.
361
*/
362
a1 |= a0 & c1;
363
a0 &= ~c1;
364
b1 |= b0 & c1;
365
b0 &= ~c1;
366
a_hi = (a0 << 15) + a1;
367
b_hi = (b0 << 15) + b1;
368
a_lo = a[0];
369
b_lo = b[0];
370
371
/*
372
* Compute reduction factors:
373
*
374
* a' = a*pa + b*pb
375
* b' = a*qa + b*qb
376
*
377
* such that a' and b' are both multiple of 2^15, but are
378
* only marginally larger than a and b.
379
*/
380
pa = 1;
381
pb = 0;
382
qa = 0;
383
qb = 1;
384
for (i = 0; i < 15; i ++) {
385
/*
386
* At each iteration:
387
*
388
* a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
389
* b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
390
* a <- a/2 if: a is even
391
* b <- b/2 if: a is odd, b is even
392
*
393
* We multiply a_lo and b_lo by 2 at each
394
* iteration, thus a division by 2 really is a
395
* non-multiplication by 2.
396
*/
397
uint32_t r, oa, ob, cAB, cBA, cA;
398
399
/*
400
* cAB = 1 if b must be subtracted from a
401
* cBA = 1 if a must be subtracted from b
402
* cA = 1 if a is divided by 2, 0 otherwise
403
*
404
* Rules:
405
*
406
* cAB and cBA cannot be both 1.
407
* if a is not divided by 2, b is.
408
*/
409
r = GT(a_hi, b_hi);
410
oa = (a_lo >> i) & 1;
411
ob = (b_lo >> i) & 1;
412
cAB = oa & ob & r;
413
cBA = oa & ob & NOT(r);
414
cA = cAB | NOT(oa);
415
416
/*
417
* Conditional subtractions.
418
*/
419
a_lo -= b_lo & -cAB;
420
a_hi -= b_hi & -cAB;
421
pa -= qa & -(int32_t)cAB;
422
pb -= qb & -(int32_t)cAB;
423
b_lo -= a_lo & -cBA;
424
b_hi -= a_hi & -cBA;
425
qa -= pa & -(int32_t)cBA;
426
qb -= pb & -(int32_t)cBA;
427
428
/*
429
* Shifting.
430
*/
431
a_lo += a_lo & (cA - 1);
432
pa += pa & ((int32_t)cA - 1);
433
pb += pb & ((int32_t)cA - 1);
434
a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
435
b_lo += b_lo & -cA;
436
qa += qa & -(int32_t)cA;
437
qb += qb & -(int32_t)cA;
438
b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
439
}
440
441
/*
442
* Replace a and b with new values a' and b'.
443
*/
444
r = co_reduce(a, b, len, pa, pb, qa, qb);
445
pa -= pa * ((r & 1) << 1);
446
pb -= pb * ((r & 1) << 1);
447
qa -= qa * (r & 2);
448
qb -= qb * (r & 2);
449
co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
450
}
451
452
/*
453
* Now one of the arrays should be 0, and the other contains
454
* the GCD. If a is 0, then u is 0 as well, and v contains
455
* the division result.
456
* Result is correct if and only if GCD is 1.
457
*/
458
r = (a[0] | b[0]) ^ 1;
459
u[0] |= v[0];
460
for (k = 1; k < len; k ++) {
461
r |= a[k] | b[k];
462
u[k] |= v[k];
463
}
464
return EQ0(r);
465
}
466
467