CoCalc -- ec_p256

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/ec/ec_p256_m31.c
¹⁰⁷⁰⁰¹ views
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24

25
#include "inner.h"
26

27
/*
28
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29
 * that right-shifting a signed negative integer copies the sign bit
30
 * (arithmetic right-shift). This is "implementation-defined behaviour",
31
 * i.e. it is not undefined, but it may differ between compilers. Each
32
 * compiler is supposed to document its behaviour in that respect. GCC
33
 * explicitly defines that an arithmetic right shift is used. We expect
34
 * all other compilers to do the same, because underlying CPU offer an
35
 * arithmetic right shift opcode that could not be used otherwise.
36
 */
37
#if BR_NO_ARITH_SHIFT
38
#define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
39
                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40
#define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
41
                      | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42
#else
43
#define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
44
#define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
45
#endif
46

47
/*
48
 * Convert an integer from unsigned big-endian encoding to a sequence of
49
 * 30-bit words in little-endian order. The final "partial" word is
50
 * returned.
51
 */
52
static uint32_t
53
be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54
{
55
	uint32_t acc;
56
	int acc_len;
57

58
	acc = 0;
59
	acc_len = 0;
60
	while (len -- > 0) {
61
		uint32_t b;
62

63
		b = src[len];
64
		if (acc_len < 22) {
65
			acc |= b << acc_len;
66
			acc_len += 8;
67
		} else {
68
			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69
			acc = b >> (30 - acc_len);
70
			acc_len -= 22;
71
		}
72
	}
73
	return acc;
74
}
75

76
/*
77
 * Convert an integer (30-bit words, little-endian) to unsigned
78
 * big-endian encoding. The total encoding length is provided; all
79
 * the destination bytes will be filled.
80
 */
81
static void
82
le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83
{
84
	uint32_t acc;
85
	int acc_len;
86

87
	acc = 0;
88
	acc_len = 0;
89
	while (len -- > 0) {
90
		if (acc_len < 8) {
91
			uint32_t w;
92

93
			w = *src ++;
94
			dst[len] = (unsigned char)(acc | (w << acc_len));
95
			acc = w >> (8 - acc_len);
96
			acc_len += 22;
97
		} else {
98
			dst[len] = (unsigned char)acc;
99
			acc >>= 8;
100
			acc_len -= 8;
101
		}
102
	}
103
}
104

105
/*
106
 * Multiply two integers. Source integers are represented as arrays of
107
 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108
 * 18 words of 30 bits each.
109
 */
110
static void
111
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112
{
113
	/*
114
	 * Maximum intermediate result is no more than
115
	 * 10376293531797946367, which fits in 64 bits. Reason:
116
	 *
117
	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118
	 *   10376293531797946367 < 9663676407 * 2^30
119
	 *
120
	 * Thus, adding together 9 products of 30-bit integers, with
121
	 * a carry of at most 9663676406, yields an integer that fits
122
	 * on 64 bits and generates a carry of at most 9663676406.
123
	 */
124
	uint64_t t[17];
125
	uint64_t cc;
126
	int i;
127

128
	t[ 0] = MUL31(a[0], b[0]);
129
	t[ 1] = MUL31(a[0], b[1])
130
		+ MUL31(a[1], b[0]);
131
	t[ 2] = MUL31(a[0], b[2])
132
		+ MUL31(a[1], b[1])
133
		+ MUL31(a[2], b[0]);
134
	t[ 3] = MUL31(a[0], b[3])
135
		+ MUL31(a[1], b[2])
136
		+ MUL31(a[2], b[1])
137
		+ MUL31(a[3], b[0]);
138
	t[ 4] = MUL31(a[0], b[4])
139
		+ MUL31(a[1], b[3])
140
		+ MUL31(a[2], b[2])
141
		+ MUL31(a[3], b[1])
142
		+ MUL31(a[4], b[0]);
143
	t[ 5] = MUL31(a[0], b[5])
144
		+ MUL31(a[1], b[4])
145
		+ MUL31(a[2], b[3])
146
		+ MUL31(a[3], b[2])
147
		+ MUL31(a[4], b[1])
148
		+ MUL31(a[5], b[0]);
149
	t[ 6] = MUL31(a[0], b[6])
150
		+ MUL31(a[1], b[5])
151
		+ MUL31(a[2], b[4])
152
		+ MUL31(a[3], b[3])
153
		+ MUL31(a[4], b[2])
154
		+ MUL31(a[5], b[1])
155
		+ MUL31(a[6], b[0]);
156
	t[ 7] = MUL31(a[0], b[7])
157
		+ MUL31(a[1], b[6])
158
		+ MUL31(a[2], b[5])
159
		+ MUL31(a[3], b[4])
160
		+ MUL31(a[4], b[3])
161
		+ MUL31(a[5], b[2])
162
		+ MUL31(a[6], b[1])
163
		+ MUL31(a[7], b[0]);
164
	t[ 8] = MUL31(a[0], b[8])
165
		+ MUL31(a[1], b[7])
166
		+ MUL31(a[2], b[6])
167
		+ MUL31(a[3], b[5])
168
		+ MUL31(a[4], b[4])
169
		+ MUL31(a[5], b[3])
170
		+ MUL31(a[6], b[2])
171
		+ MUL31(a[7], b[1])
172
		+ MUL31(a[8], b[0]);
173
	t[ 9] = MUL31(a[1], b[8])
174
		+ MUL31(a[2], b[7])
175
		+ MUL31(a[3], b[6])
176
		+ MUL31(a[4], b[5])
177
		+ MUL31(a[5], b[4])
178
		+ MUL31(a[6], b[3])
179
		+ MUL31(a[7], b[2])
180
		+ MUL31(a[8], b[1]);
181
	t[10] = MUL31(a[2], b[8])
182
		+ MUL31(a[3], b[7])
183
		+ MUL31(a[4], b[6])
184
		+ MUL31(a[5], b[5])
185
		+ MUL31(a[6], b[4])
186
		+ MUL31(a[7], b[3])
187
		+ MUL31(a[8], b[2]);
188
	t[11] = MUL31(a[3], b[8])
189
		+ MUL31(a[4], b[7])
190
		+ MUL31(a[5], b[6])
191
		+ MUL31(a[6], b[5])
192
		+ MUL31(a[7], b[4])
193
		+ MUL31(a[8], b[3]);
194
	t[12] = MUL31(a[4], b[8])
195
		+ MUL31(a[5], b[7])
196
		+ MUL31(a[6], b[6])
197
		+ MUL31(a[7], b[5])
198
		+ MUL31(a[8], b[4]);
199
	t[13] = MUL31(a[5], b[8])
200
		+ MUL31(a[6], b[7])
201
		+ MUL31(a[7], b[6])
202
		+ MUL31(a[8], b[5]);
203
	t[14] = MUL31(a[6], b[8])
204
		+ MUL31(a[7], b[7])
205
		+ MUL31(a[8], b[6]);
206
	t[15] = MUL31(a[7], b[8])
207
		+ MUL31(a[8], b[7]);
208
	t[16] = MUL31(a[8], b[8]);
209

210
	/*
211
	 * Propagate carries.
212
	 */
213
	cc = 0;
214
	for (i = 0; i < 17; i ++) {
215
		uint64_t w;
216

217
		w = t[i] + cc;
218
		d[i] = (uint32_t)w & 0x3FFFFFFF;
219
		cc = w >> 30;
220
	}
221
	d[17] = (uint32_t)cc;
222
}
223

224
/*
225
 * Square a 270-bit integer, represented as an array of nine 30-bit words.
226
 * Result uses 18 words of 30 bits each.
227
 */
228
static void
229
square9(uint32_t *d, const uint32_t *a)
230
{
231
	uint64_t t[17];
232
	uint64_t cc;
233
	int i;
234

235
	t[ 0] = MUL31(a[0], a[0]);
236
	t[ 1] = ((MUL31(a[0], a[1])) << 1);
237
	t[ 2] = MUL31(a[1], a[1])
238
		+ ((MUL31(a[0], a[2])) << 1);
239
	t[ 3] = ((MUL31(a[0], a[3])
240
		+ MUL31(a[1], a[2])) << 1);
241
	t[ 4] = MUL31(a[2], a[2])
242
		+ ((MUL31(a[0], a[4])
243
		+ MUL31(a[1], a[3])) << 1);
244
	t[ 5] = ((MUL31(a[0], a[5])
245
		+ MUL31(a[1], a[4])
246
		+ MUL31(a[2], a[3])) << 1);
247
	t[ 6] = MUL31(a[3], a[3])
248
		+ ((MUL31(a[0], a[6])
249
		+ MUL31(a[1], a[5])
250
		+ MUL31(a[2], a[4])) << 1);
251
	t[ 7] = ((MUL31(a[0], a[7])
252
		+ MUL31(a[1], a[6])
253
		+ MUL31(a[2], a[5])
254
		+ MUL31(a[3], a[4])) << 1);
255
	t[ 8] = MUL31(a[4], a[4])
256
		+ ((MUL31(a[0], a[8])
257
		+ MUL31(a[1], a[7])
258
		+ MUL31(a[2], a[6])
259
		+ MUL31(a[3], a[5])) << 1);
260
	t[ 9] = ((MUL31(a[1], a[8])
261
		+ MUL31(a[2], a[7])
262
		+ MUL31(a[3], a[6])
263
		+ MUL31(a[4], a[5])) << 1);
264
	t[10] = MUL31(a[5], a[5])
265
		+ ((MUL31(a[2], a[8])
266
		+ MUL31(a[3], a[7])
267
		+ MUL31(a[4], a[6])) << 1);
268
	t[11] = ((MUL31(a[3], a[8])
269
		+ MUL31(a[4], a[7])
270
		+ MUL31(a[5], a[6])) << 1);
271
	t[12] = MUL31(a[6], a[6])
272
		+ ((MUL31(a[4], a[8])
273
		+ MUL31(a[5], a[7])) << 1);
274
	t[13] = ((MUL31(a[5], a[8])
275
		+ MUL31(a[6], a[7])) << 1);
276
	t[14] = MUL31(a[7], a[7])
277
		+ ((MUL31(a[6], a[8])) << 1);
278
	t[15] = ((MUL31(a[7], a[8])) << 1);
279
	t[16] = MUL31(a[8], a[8]);
280

281
	/*
282
	 * Propagate carries.
283
	 */
284
	cc = 0;
285
	for (i = 0; i < 17; i ++) {
286
		uint64_t w;
287

288
		w = t[i] + cc;
289
		d[i] = (uint32_t)w & 0x3FFFFFFF;
290
		cc = w >> 30;
291
	}
292
	d[17] = (uint32_t)cc;
293
}
294

295
/*
296
 * Base field modulus for P-256.
297
 */
298
static const uint32_t F256[] = {
299

300
	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301
	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302
};
303

304
/*
305
 * The 'b' curve equation coefficient for P-256.
306
 */
307
static const uint32_t P256_B[] = {
308

309
	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310
	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311
};
312

313
/*
314
 * Addition in the field. Source operands shall fit on 257 bits; output
315
 * will be lower than twice the modulus.
316
 */
317
static void
318
add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319
{
320
	uint32_t w, cc;
321
	int i;
322

323
	cc = 0;
324
	for (i = 0; i < 9; i ++) {
325
		w = a[i] + b[i] + cc;
326
		d[i] = w & 0x3FFFFFFF;
327
		cc = w >> 30;
328
	}
329
	w >>= 16;
330
	d[8] &= 0xFFFF;
331
	d[3] -= w << 6;
332
	d[6] -= w << 12;
333
	d[7] += w << 14;
334
	cc = w;
335
	for (i = 0; i < 9; i ++) {
336
		w = d[i] + cc;
337
		d[i] = w & 0x3FFFFFFF;
338
		cc = ARSH(w, 30);
339
	}
340
}
341

342
/*
343
 * Subtraction in the field. Source operands shall be smaller than twice
344
 * the modulus; the result will fulfil the same property.
345
 */
346
static void
347
sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348
{
349
	uint32_t w, cc;
350
	int i;
351

352
	/*
353
	 * We really compute a - b + 2*p to make sure that the result is
354
	 * positive.
355
	 */
356
	w = a[0] - b[0] - 0x00002;
357
	d[0] = w & 0x3FFFFFFF;
358
	w = a[1] - b[1] + ARSH(w, 30);
359
	d[1] = w & 0x3FFFFFFF;
360
	w = a[2] - b[2] + ARSH(w, 30);
361
	d[2] = w & 0x3FFFFFFF;
362
	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363
	d[3] = w & 0x3FFFFFFF;
364
	w = a[4] - b[4] + ARSH(w, 30);
365
	d[4] = w & 0x3FFFFFFF;
366
	w = a[5] - b[5] + ARSH(w, 30);
367
	d[5] = w & 0x3FFFFFFF;
368
	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369
	d[6] = w & 0x3FFFFFFF;
370
	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371
	d[7] = w & 0x3FFFFFFF;
372
	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373
	d[8] = w & 0xFFFF;
374
	w >>= 16;
375
	d[8] &= 0xFFFF;
376
	d[3] -= w << 6;
377
	d[6] -= w << 12;
378
	d[7] += w << 14;
379
	cc = w;
380
	for (i = 0; i < 9; i ++) {
381
		w = d[i] + cc;
382
		d[i] = w & 0x3FFFFFFF;
383
		cc = ARSH(w, 30);
384
	}
385
}
386

387
/*
388
 * Compute a multiplication in F256. Source operands shall be less than
389
 * twice the modulus.
390
 */
391
static void
392
mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393
{
394
	uint32_t t[18];
395
	uint64_t s[18];
396
	uint64_t cc, x;
397
	uint32_t z, c;
398
	int i;
399

400
	mul9(t, a, b);
401

402
	/*
403
	 * Modular reduction: each high word in added/subtracted where
404
	 * necessary.
405
	 *
406
	 * The modulus is:
407
	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408
	 * Therefore:
409
	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410
	 *
411
	 * For a word x at bit offset n (n >= 256), we have:
412
	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
413
	 *            - x*2^(n - 160) + x*2^(n-256) mod p
414
	 *
415
	 * Thus, we can nullify the high word if we reinject it at some
416
	 * proper emplacements.
417
	 *
418
	 * We use 64-bit intermediate words to allow for carries to
419
	 * accumulate easily, before performing the final propagation.
420
	 */
421
	for (i = 0; i < 18; i ++) {
422
		s[i] = t[i];
423
	}
424

425
	for (i = 17; i >= 9; i --) {
426
		uint64_t y;
427

428
		y = s[i];
429
		s[i - 1] += ARSHW(y, 2);
430
		s[i - 2] += (y << 28) & 0x3FFFFFFF;
431
		s[i - 2] -= ARSHW(y, 4);
432
		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
433
		s[i - 5] -= ARSHW(y, 10);
434
		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
435
		s[i - 8] += ARSHW(y, 16);
436
		s[i - 9] += (y << 14) & 0x3FFFFFFF;
437
	}
438

439
	/*
440
	 * Carry propagation must be signed. Moreover, we may have overdone
441
	 * it a bit, and obtain a negative result.
442
	 *
443
	 * The loop above ran 9 times; each time, each word was augmented
444
	 * by at most one extra word (in absolute value). Thus, the top
445
	 * word must in fine fit in 39 bits, so the carry below will fit
446
	 * on 9 bits.
447
	 */
448
	cc = 0;
449
	for (i = 0; i < 9; i ++) {
450
		x = s[i] + cc;
451
		d[i] = (uint32_t)x & 0x3FFFFFFF;
452
		cc = ARSHW(x, 30);
453
	}
454

455
	/*
456
	 * All nine words fit on 30 bits, but there may be an extra
457
	 * carry for a few bits (at most 9), and that carry may be
458
	 * negative. Moreover, we want the result to fit on 257 bits.
459
	 * The two lines below ensure that the word in d[] has length
460
	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461
	 * significant length of cc is less than 24 bits, so we will be
462
	 * able to switch to 32-bit operations.
463
	 */
464
	cc = ARSHW(x, 16);
465
	d[8] &= 0xFFFF;
466

467
	/*
468
	 * One extra round of reduction, for cc*2^256, which means
469
	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
470
	 * value. If cc is negative, then it may happen (rarely, but
471
	 * not neglectibly so) that the result would be negative. In
472
	 * order to avoid that, if cc is negative, then we add the
473
	 * modulus once. Note that if cc is negative, then propagating
474
	 * that carry must yield a value lower than the modulus, so
475
	 * adding the modulus once will keep the final result under
476
	 * twice the modulus.
477
	 */
478
	z = (uint32_t)cc;
479
	d[3] -= z << 6;
480
	d[6] -= (z << 12) & 0x3FFFFFFF;
481
	d[7] -= ARSH(z, 18);
482
	d[7] += (z << 14) & 0x3FFFFFFF;
483
	d[8] += ARSH(z, 16);
484
	c = z >> 31;
485
	d[0] -= c;
486
	d[3] += c << 6;
487
	d[6] += c << 12;
488
	d[7] -= c << 14;
489
	d[8] += c << 16;
490
	for (i = 0; i < 9; i ++) {
491
		uint32_t w;
492

493
		w = d[i] + z;
494
		d[i] = w & 0x3FFFFFFF;
495
		z = ARSH(w, 30);
496
	}
497
}
498

499
/*
500
 * Compute a square in F256. Source operand shall be less than
501
 * twice the modulus.
502
 */
503
static void
504
square_f256(uint32_t *d, const uint32_t *a)
505
{
506
	uint32_t t[18];
507
	uint64_t s[18];
508
	uint64_t cc, x;
509
	uint32_t z, c;
510
	int i;
511

512
	square9(t, a);
513

514
	/*
515
	 * Modular reduction: each high word in added/subtracted where
516
	 * necessary.
517
	 *
518
	 * The modulus is:
519
	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
520
	 * Therefore:
521
	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
522
	 *
523
	 * For a word x at bit offset n (n >= 256), we have:
524
	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
525
	 *            - x*2^(n - 160) + x*2^(n-256) mod p
526
	 *
527
	 * Thus, we can nullify the high word if we reinject it at some
528
	 * proper emplacements.
529
	 *
530
	 * We use 64-bit intermediate words to allow for carries to
531
	 * accumulate easily, before performing the final propagation.
532
	 */
533
	for (i = 0; i < 18; i ++) {
534
		s[i] = t[i];
535
	}
536

537
	for (i = 17; i >= 9; i --) {
538
		uint64_t y;
539

540
		y = s[i];
541
		s[i - 1] += ARSHW(y, 2);
542
		s[i - 2] += (y << 28) & 0x3FFFFFFF;
543
		s[i - 2] -= ARSHW(y, 4);
544
		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
545
		s[i - 5] -= ARSHW(y, 10);
546
		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
547
		s[i - 8] += ARSHW(y, 16);
548
		s[i - 9] += (y << 14) & 0x3FFFFFFF;
549
	}
550

551
	/*
552
	 * Carry propagation must be signed. Moreover, we may have overdone
553
	 * it a bit, and obtain a negative result.
554
	 *
555
	 * The loop above ran 9 times; each time, each word was augmented
556
	 * by at most one extra word (in absolute value). Thus, the top
557
	 * word must in fine fit in 39 bits, so the carry below will fit
558
	 * on 9 bits.
559
	 */
560
	cc = 0;
561
	for (i = 0; i < 9; i ++) {
562
		x = s[i] + cc;
563
		d[i] = (uint32_t)x & 0x3FFFFFFF;
564
		cc = ARSHW(x, 30);
565
	}
566

567
	/*
568
	 * All nine words fit on 30 bits, but there may be an extra
569
	 * carry for a few bits (at most 9), and that carry may be
570
	 * negative. Moreover, we want the result to fit on 257 bits.
571
	 * The two lines below ensure that the word in d[] has length
572
	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
573
	 * significant length of cc is less than 24 bits, so we will be
574
	 * able to switch to 32-bit operations.
575
	 */
576
	cc = ARSHW(x, 16);
577
	d[8] &= 0xFFFF;
578

579
	/*
580
	 * One extra round of reduction, for cc*2^256, which means
581
	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
582
	 * value. If cc is negative, then it may happen (rarely, but
583
	 * not neglectibly so) that the result would be negative. In
584
	 * order to avoid that, if cc is negative, then we add the
585
	 * modulus once. Note that if cc is negative, then propagating
586
	 * that carry must yield a value lower than the modulus, so
587
	 * adding the modulus once will keep the final result under
588
	 * twice the modulus.
589
	 */
590
	z = (uint32_t)cc;
591
	d[3] -= z << 6;
592
	d[6] -= (z << 12) & 0x3FFFFFFF;
593
	d[7] -= ARSH(z, 18);
594
	d[7] += (z << 14) & 0x3FFFFFFF;
595
	d[8] += ARSH(z, 16);
596
	c = z >> 31;
597
	d[0] -= c;
598
	d[3] += c << 6;
599
	d[6] += c << 12;
600
	d[7] -= c << 14;
601
	d[8] += c << 16;
602
	for (i = 0; i < 9; i ++) {
603
		uint32_t w;
604

605
		w = d[i] + z;
606
		d[i] = w & 0x3FFFFFFF;
607
		z = ARSH(w, 30);
608
	}
609
}
610

611
/*
612
 * Perform a "final reduction" in field F256 (field for curve P-256).
613
 * The source value must be less than twice the modulus. If the value
614
 * is not lower than the modulus, then the modulus is subtracted and
615
 * this function returns 1; otherwise, it leaves it untouched and it
616
 * returns 0.
617
 */
618
static uint32_t
619
reduce_final_f256(uint32_t *d)
620
{
621
	uint32_t t[9];
622
	uint32_t cc;
623
	int i;
624

625
	cc = 0;
626
	for (i = 0; i < 9; i ++) {
627
		uint32_t w;
628

629
		w = d[i] - F256[i] - cc;
630
		cc = w >> 31;
631
		t[i] = w & 0x3FFFFFFF;
632
	}
633
	cc ^= 1;
634
	CCOPY(cc, d, t, sizeof t);
635
	return cc;
636
}
637

638
/*
639
 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
640
 * are such that:
641
 *   X = x / z^2
642
 *   Y = y / z^3
643
 * For the point at infinity, z = 0.
644
 * Each point thus admits many possible representations.
645
 *
646
 * Coordinates are represented in arrays of 32-bit integers, each holding
647
 * 30 bits of data. Values may also be slightly greater than the modulus,
648
 * but they will always be lower than twice the modulus.
649
 */
650
typedef struct {
651
	uint32_t x[9];
652
	uint32_t y[9];
653
	uint32_t z[9];
654
} p256_jacobian;
655

656
/*
657
 * Convert a point to affine coordinates:
658
 *  - If the point is the point at infinity, then all three coordinates
659
 *    are set to 0.
660
 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
661
 *    coordinates are the 'X' and 'Y' affine coordinates.
662
 * The coordinates are guaranteed to be lower than the modulus.
663
 */
664
static void
665
p256_to_affine(p256_jacobian *P)
666
{
667
	uint32_t t1[9], t2[9];
668
	int i;
669

670
	/*
671
	 * Invert z with a modular exponentiation: the modulus is
672
	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
673
	 * p-2. Exponent bit pattern (from high to low) is:
674
	 *  - 32 bits of value 1
675
	 *  - 31 bits of value 0
676
	 *  - 1 bit of value 1
677
	 *  - 96 bits of value 0
678
	 *  - 94 bits of value 1
679
	 *  - 1 bit of value 0
680
	 *  - 1 bit of value 1
681
	 * Thus, we precompute z^(2^31-1) to speed things up.
682
	 *
683
	 * If z = 0 (point at infinity) then the modular exponentiation
684
	 * will yield 0, which leads to the expected result (all three
685
	 * coordinates set to 0).
686
	 */
687

688
	/*
689
	 * A simple square-and-multiply for z^(2^31-1). We could save about
690
	 * two dozen multiplications here with an addition chain, but
691
	 * this would require a bit more code, and extra stack buffers.
692
	 */
693
	memcpy(t1, P->z, sizeof P->z);
694
	for (i = 0; i < 30; i ++) {
695
		square_f256(t1, t1);
696
		mul_f256(t1, t1, P->z);
697
	}
698

699
	/*
700
	 * Square-and-multiply. Apart from the squarings, we have a few
701
	 * multiplications to set bits to 1; we multiply by the original z
702
	 * for setting 1 bit, and by t1 for setting 31 bits.
703
	 */
704
	memcpy(t2, P->z, sizeof P->z);
705
	for (i = 1; i < 256; i ++) {
706
		square_f256(t2, t2);
707
		switch (i) {
708
		case 31:
709
		case 190:
710
		case 221:
711
		case 252:
712
			mul_f256(t2, t2, t1);
713
			break;
714
		case 63:
715
		case 253:
716
		case 255:
717
			mul_f256(t2, t2, P->z);
718
			break;
719
		}
720
	}
721

722
	/*
723
	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
724
	 */
725
	mul_f256(t1, t2, t2);
726
	mul_f256(P->x, t1, P->x);
727
	mul_f256(t1, t1, t2);
728
	mul_f256(P->y, t1, P->y);
729
	reduce_final_f256(P->x);
730
	reduce_final_f256(P->y);
731

732
	/*
733
	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
734
	 * this will set z to 1.
735
	 */
736
	mul_f256(P->z, P->z, t2);
737
	reduce_final_f256(P->z);
738
}
739

740
/*
741
 * Double a point in P-256. This function works for all valid points,
742
 * including the point at infinity.
743
 */
744
static void
745
p256_double(p256_jacobian *Q)
746
{
747
	/*
748
	 * Doubling formulas are:
749
	 *
750
	 *   s = 4*x*y^2
751
	 *   m = 3*(x + z^2)*(x - z^2)
752
	 *   x' = m^2 - 2*s
753
	 *   y' = m*(s - x') - 8*y^4
754
	 *   z' = 2*y*z
755
	 *
756
	 * These formulas work for all points, including points of order 2
757
	 * and points at infinity:
758
	 *   - If y = 0 then z' = 0. But there is no such point in P-256
759
	 *     anyway.
760
	 *   - If z = 0 then z' = 0.
761
	 */
762
	uint32_t t1[9], t2[9], t3[9], t4[9];
763

764
	/*
765
	 * Compute z^2 in t1.
766
	 */
767
	square_f256(t1, Q->z);
768

769
	/*
770
	 * Compute x-z^2 in t2 and x+z^2 in t1.
771
	 */
772
	add_f256(t2, Q->x, t1);
773
	sub_f256(t1, Q->x, t1);
774

775
	/*
776
	 * Compute 3*(x+z^2)*(x-z^2) in t1.
777
	 */
778
	mul_f256(t3, t1, t2);
779
	add_f256(t1, t3, t3);
780
	add_f256(t1, t3, t1);
781

782
	/*
783
	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
784
	 */
785
	square_f256(t3, Q->y);
786
	add_f256(t3, t3, t3);
787
	mul_f256(t2, Q->x, t3);
788
	add_f256(t2, t2, t2);
789

790
	/*
791
	 * Compute x' = m^2 - 2*s.
792
	 */
793
	square_f256(Q->x, t1);
794
	sub_f256(Q->x, Q->x, t2);
795
	sub_f256(Q->x, Q->x, t2);
796

797
	/*
798
	 * Compute z' = 2*y*z.
799
	 */
800
	mul_f256(t4, Q->y, Q->z);
801
	add_f256(Q->z, t4, t4);
802

803
	/*
804
	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
805
	 * 2*y^2 in t3.
806
	 */
807
	sub_f256(t2, t2, Q->x);
808
	mul_f256(Q->y, t1, t2);
809
	square_f256(t4, t3);
810
	add_f256(t4, t4, t4);
811
	sub_f256(Q->y, Q->y, t4);
812
}
813

814
/*
815
 * Add point P2 to point P1.
816
 *
817
 * This function computes the wrong result in the following cases:
818
 *
819
 *   - If P1 == 0 but P2 != 0
820
 *   - If P1 != 0 but P2 == 0
821
 *   - If P1 == P2
822
 *
823
 * In all three cases, P1 is set to the point at infinity.
824
 *
825
 * Returned value is 0 if one of the following occurs:
826
 *
827
 *   - P1 and P2 have the same Y coordinate
828
 *   - P1 == 0 and P2 == 0
829
 *   - The Y coordinate of one of the points is 0 and the other point is
830
 *     the point at infinity.
831
 *
832
 * The third case cannot actually happen with valid points, since a point
833
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
834
 * curve P-256.
835
 *
836
 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
837
 * can apply the following:
838
 *
839
 *   - If the result is not the point at infinity, then it is correct.
840
 *   - Otherwise, if the returned value is 1, then this is a case of
841
 *     P1+P2 == 0, so the result is indeed the point at infinity.
842
 *   - Otherwise, P1 == P2, so a "double" operation should have been
843
 *     performed.
844
 */
845
static uint32_t
846
p256_add(p256_jacobian *P1, const p256_jacobian *P2)
847
{
848
	/*
849
	 * Addtions formulas are:
850
	 *
851
	 *   u1 = x1 * z2^2
852
	 *   u2 = x2 * z1^2
853
	 *   s1 = y1 * z2^3
854
	 *   s2 = y2 * z1^3
855
	 *   h = u2 - u1
856
	 *   r = s2 - s1
857
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
858
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
859
	 *   z3 = h * z1 * z2
860
	 */
861
	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
862
	uint32_t ret;
863
	int i;
864

865
	/*
866
	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
867
	 */
868
	square_f256(t3, P2->z);
869
	mul_f256(t1, P1->x, t3);
870
	mul_f256(t4, P2->z, t3);
871
	mul_f256(t3, P1->y, t4);
872

873
	/*
874
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
875
	 */
876
	square_f256(t4, P1->z);
877
	mul_f256(t2, P2->x, t4);
878
	mul_f256(t5, P1->z, t4);
879
	mul_f256(t4, P2->y, t5);
880

881
	/*
882
	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
883
	 * We need to test whether r is zero, so we will do some extra
884
	 * reduce.
885
	 */
886
	sub_f256(t2, t2, t1);
887
	sub_f256(t4, t4, t3);
888
	reduce_final_f256(t4);
889
	ret = 0;
890
	for (i = 0; i < 9; i ++) {
891
		ret |= t4[i];
892
	}
893
	ret = (ret | -ret) >> 31;
894

895
	/*
896
	 * Compute u1*h^2 (in t6) and h^3 (in t5);
897
	 */
898
	square_f256(t7, t2);
899
	mul_f256(t6, t1, t7);
900
	mul_f256(t5, t7, t2);
901

902
	/*
903
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
904
	 */
905
	square_f256(P1->x, t4);
906
	sub_f256(P1->x, P1->x, t5);
907
	sub_f256(P1->x, P1->x, t6);
908
	sub_f256(P1->x, P1->x, t6);
909

910
	/*
911
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
912
	 */
913
	sub_f256(t6, t6, P1->x);
914
	mul_f256(P1->y, t4, t6);
915
	mul_f256(t1, t5, t3);
916
	sub_f256(P1->y, P1->y, t1);
917

918
	/*
919
	 * Compute z3 = h*z1*z2.
920
	 */
921
	mul_f256(t1, P1->z, P2->z);
922
	mul_f256(P1->z, t1, t2);
923

924
	return ret;
925
}
926

927
/*
928
 * Add point P2 to point P1. This is a specialised function for the
929
 * case when P2 is a non-zero point in affine coordinate.
930
 *
931
 * This function computes the wrong result in the following cases:
932
 *
933
 *   - If P1 == 0
934
 *   - If P1 == P2
935
 *
936
 * In both cases, P1 is set to the point at infinity.
937
 *
938
 * Returned value is 0 if one of the following occurs:
939
 *
940
 *   - P1 and P2 have the same Y coordinate
941
 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
942
 *
943
 * The second case cannot actually happen with valid points, since a point
944
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
945
 * curve P-256.
946
 *
947
 * Therefore, assuming that P1 != 0 on input, then the caller
948
 * can apply the following:
949
 *
950
 *   - If the result is not the point at infinity, then it is correct.
951
 *   - Otherwise, if the returned value is 1, then this is a case of
952
 *     P1+P2 == 0, so the result is indeed the point at infinity.
953
 *   - Otherwise, P1 == P2, so a "double" operation should have been
954
 *     performed.
955
 */
956
static uint32_t
957
p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
958
{
959
	/*
960
	 * Addtions formulas are:
961
	 *
962
	 *   u1 = x1
963
	 *   u2 = x2 * z1^2
964
	 *   s1 = y1
965
	 *   s2 = y2 * z1^3
966
	 *   h = u2 - u1
967
	 *   r = s2 - s1
968
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
969
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
970
	 *   z3 = h * z1
971
	 */
972
	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
973
	uint32_t ret;
974
	int i;
975

976
	/*
977
	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
978
	 */
979
	memcpy(t1, P1->x, sizeof t1);
980
	memcpy(t3, P1->y, sizeof t3);
981

982
	/*
983
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
984
	 */
985
	square_f256(t4, P1->z);
986
	mul_f256(t2, P2->x, t4);
987
	mul_f256(t5, P1->z, t4);
988
	mul_f256(t4, P2->y, t5);
989

990
	/*
991
	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
992
	 * We need to test whether r is zero, so we will do some extra
993
	 * reduce.
994
	 */
995
	sub_f256(t2, t2, t1);
996
	sub_f256(t4, t4, t3);
997
	reduce_final_f256(t4);
998
	ret = 0;
999
	for (i = 0; i < 9; i ++) {
1000
		ret |= t4[i];
1001
	}
1002
	ret = (ret | -ret) >> 31;
1003

1004
	/*
1005
	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1006
	 */
1007
	square_f256(t7, t2);
1008
	mul_f256(t6, t1, t7);
1009
	mul_f256(t5, t7, t2);
1010

1011
	/*
1012
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013
	 */
1014
	square_f256(P1->x, t4);
1015
	sub_f256(P1->x, P1->x, t5);
1016
	sub_f256(P1->x, P1->x, t6);
1017
	sub_f256(P1->x, P1->x, t6);
1018

1019
	/*
1020
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021
	 */
1022
	sub_f256(t6, t6, P1->x);
1023
	mul_f256(P1->y, t4, t6);
1024
	mul_f256(t1, t5, t3);
1025
	sub_f256(P1->y, P1->y, t1);
1026

1027
	/*
1028
	 * Compute z3 = h*z1*z2.
1029
	 */
1030
	mul_f256(P1->z, P1->z, t2);
1031

1032
	return ret;
1033
}
1034

1035
/*
1036
 * Decode a P-256 point. This function does not support the point at
1037
 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038
 */
1039
static uint32_t
1040
p256_decode(p256_jacobian *P, const void *src, size_t len)
1041
{
1042
	const unsigned char *buf;
1043
	uint32_t tx[9], ty[9], t1[9], t2[9];
1044
	uint32_t bad;
1045
	int i;
1046

1047
	if (len != 65) {
1048
		return 0;
1049
	}
1050
	buf = src;
1051

1052
	/*
1053
	 * First byte must be 0x04 (uncompressed format). We could support
1054
	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055
	 * least significant bit of the Y coordinate), but it is explicitly
1056
	 * forbidden by RFC 5480 (section 2.2).
1057
	 */
1058
	bad = NEQ(buf[0], 0x04);
1059

1060
	/*
1061
	 * Decode the coordinates, and check that they are both lower
1062
	 * than the modulus.
1063
	 */
1064
	tx[8] = be8_to_le30(tx, buf + 1, 32);
1065
	ty[8] = be8_to_le30(ty, buf + 33, 32);
1066
	bad |= reduce_final_f256(tx);
1067
	bad |= reduce_final_f256(ty);
1068

1069
	/*
1070
	 * Check curve equation.
1071
	 */
1072
	square_f256(t1, tx);
1073
	mul_f256(t1, tx, t1);
1074
	square_f256(t2, ty);
1075
	sub_f256(t1, t1, tx);
1076
	sub_f256(t1, t1, tx);
1077
	sub_f256(t1, t1, tx);
1078
	add_f256(t1, t1, P256_B);
1079
	sub_f256(t1, t1, t2);
1080
	reduce_final_f256(t1);
1081
	for (i = 0; i < 9; i ++) {
1082
		bad |= t1[i];
1083
	}
1084

1085
	/*
1086
	 * Copy coordinates to the point structure.
1087
	 */
1088
	memcpy(P->x, tx, sizeof tx);
1089
	memcpy(P->y, ty, sizeof ty);
1090
	memset(P->z, 0, sizeof P->z);
1091
	P->z[0] = 1;
1092
	return EQ(bad, 0);
1093
}
1094

1095
/*
1096
 * Encode a point into a buffer. This function assumes that the point is
1097
 * valid, in affine coordinates, and not the point at infinity.
1098
 */
1099
static void
1100
p256_encode(void *dst, const p256_jacobian *P)
1101
{
1102
	unsigned char *buf;
1103

1104
	buf = dst;
1105
	buf[0] = 0x04;
1106
	le30_to_be8(buf + 1, 32, P->x);
1107
	le30_to_be8(buf + 33, 32, P->y);
1108
}
1109

1110
/*
1111
 * Multiply a curve point by an integer. The integer is assumed to be
1112
 * lower than the curve order, and the base point must not be the point
1113
 * at infinity.
1114
 */
1115
static void
1116
p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117
{
1118
	/*
1119
	 * qz is a flag that is initially 1, and remains equal to 1
1120
	 * as long as the point is the point at infinity.
1121
	 *
1122
	 * We use a 2-bit window to handle multiplier bits by pairs.
1123
	 * The precomputed window really is the points P2 and P3.
1124
	 */
1125
	uint32_t qz;
1126
	p256_jacobian P2, P3, Q, T, U;
1127

1128
	/*
1129
	 * Compute window values.
1130
	 */
1131
	P2 = *P;
1132
	p256_double(&P2);
1133
	P3 = *P;
1134
	p256_add(&P3, &P2);
1135

1136
	/*
1137
	 * We start with Q = 0. We process multiplier bits 2 by 2.
1138
	 */
1139
	memset(&Q, 0, sizeof Q);
1140
	qz = 1;
1141
	while (xlen -- > 0) {
1142
		int k;
1143

1144
		for (k = 6; k >= 0; k -= 2) {
1145
			uint32_t bits;
1146
			uint32_t bnz;
1147

1148
			p256_double(&Q);
1149
			p256_double(&Q);
1150
			T = *P;
1151
			U = Q;
1152
			bits = (*x >> k) & (uint32_t)3;
1153
			bnz = NEQ(bits, 0);
1154
			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155
			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156
			p256_add(&U, &T);
1157
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159
			qz &= ~bnz;
1160
		}
1161
		x ++;
1162
	}
1163
	*P = Q;
1164
}
1165

1166
/*
1167
 * Precomputed window: k*G points, where G is the curve generator, and k
1168
 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169
 * the point are encoded as 9 words of 30 bits each (little-endian
1170
 * order).
1171
 */
1172
static const uint32_t Gwin[15][18] = {
1173

1174
	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175
	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176
	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177
	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178
	  0x10B8BF86, 0x00004FE3 },
1179

1180
	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181
	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182
	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183
	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184
	  0x154436E3, 0x00000777 },
1185

1186
	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187
	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188
	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189
	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190
	  0x19031266, 0x00008734 },
1191

1192
	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193
	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194
	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195
	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196
	  0x15D69318, 0x0000E0F1 },
1197

1198
	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199
	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200
	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201
	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202
	  0x1F6A2412, 0x0000E0C1 },
1203

1204
	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205
	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206
	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207
	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208
	  0x041D0C8D, 0x0000E85C },
1209

1210
	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211
	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212
	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213
	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214
	  0x076F780C, 0x000073EB },
1215

1216
	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217
	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218
	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219
	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220
	  0x332F647A, 0x0000AD5A },
1221

1222
	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223
	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224
	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225
	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226
	  0x11325CB2, 0x00002A27 },
1227

1228
	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229
	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230
	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231
	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232
	  0x18A88A6A, 0x00008786 },
1233

1234
	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235
	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236
	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237
	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238
	  0x0826B331, 0x00009099 },
1239

1240
	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241
	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242
	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243
	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244
	  0x2D1AA70E, 0x00000770 },
1245

1246
	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247
	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248
	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249
	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250
	  0x163353AF, 0x000063BB },
1251

1252
	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253
	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254
	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255
	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256
	  0x3C6ECA7D, 0x0000F599 },
1257

1258
	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259
	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260
	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261
	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262
	  0x0FB8D64B, 0x0000B5B9 }
1263
};
1264

1265
/*
1266
 * Lookup one of the Gwin[] values, by index. This is constant-time.
1267
 */
1268
static void
1269
lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270
{
1271
	uint32_t xy[18];
1272
	uint32_t k;
1273
	size_t u;
1274

1275
	memset(xy, 0, sizeof xy);
1276
	for (k = 0; k < 15; k ++) {
1277
		uint32_t m;
1278

1279
		m = -EQ(idx, k + 1);
1280
		for (u = 0; u < 18; u ++) {
1281
			xy[u] |= m & Gwin[k][u];
1282
		}
1283
	}
1284
	memcpy(T->x, &xy[0], sizeof T->x);
1285
	memcpy(T->y, &xy[9], sizeof T->y);
1286
	memset(T->z, 0, sizeof T->z);
1287
	T->z[0] = 1;
1288
}
1289

1290
/*
1291
 * Multiply the generator by an integer. The integer is assumed non-zero
1292
 * and lower than the curve order.
1293
 */
1294
static void
1295
p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296
{
1297
	/*
1298
	 * qz is a flag that is initially 1, and remains equal to 1
1299
	 * as long as the point is the point at infinity.
1300
	 *
1301
	 * We use a 4-bit window to handle multiplier bits by groups
1302
	 * of 4. The precomputed window is constant static data, with
1303
	 * points in affine coordinates; we use a constant-time lookup.
1304
	 */
1305
	p256_jacobian Q;
1306
	uint32_t qz;
1307

1308
	memset(&Q, 0, sizeof Q);
1309
	qz = 1;
1310
	while (xlen -- > 0) {
1311
		int k;
1312
		unsigned bx;
1313

1314
		bx = *x ++;
1315
		for (k = 0; k < 2; k ++) {
1316
			uint32_t bits;
1317
			uint32_t bnz;
1318
			p256_jacobian T, U;
1319

1320
			p256_double(&Q);
1321
			p256_double(&Q);
1322
			p256_double(&Q);
1323
			p256_double(&Q);
1324
			bits = (bx >> 4) & 0x0F;
1325
			bnz = NEQ(bits, 0);
1326
			lookup_Gwin(&T, bits);
1327
			U = Q;
1328
			p256_add_mixed(&U, &T);
1329
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331
			qz &= ~bnz;
1332
			bx <<= 4;
1333
		}
1334
	}
1335
	*P = Q;
1336
}
1337

1338
static const unsigned char P256_G[] = {
1339
	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340
	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341
	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342
	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343
	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344
	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345
	0x68, 0x37, 0xBF, 0x51, 0xF5
1346
};
1347

1348
static const unsigned char P256_N[] = {
1349
	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351
	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352
	0x25, 0x51
1353
};
1354

1355
static const unsigned char *
1356
api_generator(int curve, size_t *len)
1357
{
1358
	(void)curve;
1359
	*len = sizeof P256_G;
1360
	return P256_G;
1361
}
1362

1363
static const unsigned char *
1364
api_order(int curve, size_t *len)
1365
{
1366
	(void)curve;
1367
	*len = sizeof P256_N;
1368
	return P256_N;
1369
}
1370

1371
static size_t
1372
api_xoff(int curve, size_t *len)
1373
{
1374
	(void)curve;
1375
	*len = 32;
1376
	return 1;
1377
}
1378

1379
static uint32_t
1380
api_mul(unsigned char *G, size_t Glen,
1381
	const unsigned char *x, size_t xlen, int curve)
1382
{
1383
	uint32_t r;
1384
	p256_jacobian P;
1385

1386
	(void)curve;
1387
	if (Glen != 65) {
1388
		return 0;
1389
	}
1390
	r = p256_decode(&P, G, Glen);
1391
	p256_mul(&P, x, xlen);
1392
	p256_to_affine(&P);
1393
	p256_encode(G, &P);
1394
	return r;
1395
}
1396

1397
static size_t
1398
api_mulgen(unsigned char *R,
1399
	const unsigned char *x, size_t xlen, int curve)
1400
{
1401
	p256_jacobian P;
1402

1403
	(void)curve;
1404
	p256_mulgen(&P, x, xlen);
1405
	p256_to_affine(&P);
1406
	p256_encode(R, &P);
1407
	return 65;
1408
}
1409

1410
static uint32_t
1411
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1412
	const unsigned char *x, size_t xlen,
1413
	const unsigned char *y, size_t ylen, int curve)
1414
{
1415
	p256_jacobian P, Q;
1416
	uint32_t r, t, z;
1417
	int i;
1418

1419
	(void)curve;
1420
	if (len != 65) {
1421
		return 0;
1422
	}
1423
	r = p256_decode(&P, A, len);
1424
	p256_mul(&P, x, xlen);
1425
	if (B == NULL) {
1426
		p256_mulgen(&Q, y, ylen);
1427
	} else {
1428
		r &= p256_decode(&Q, B, len);
1429
		p256_mul(&Q, y, ylen);
1430
	}
1431

1432
	/*
1433
	 * The final addition may fail in case both points are equal.
1434
	 */
1435
	t = p256_add(&P, &Q);
1436
	reduce_final_f256(P.z);
1437
	z = 0;
1438
	for (i = 0; i < 9; i ++) {
1439
		z |= P.z[i];
1440
	}
1441
	z = EQ(z, 0);
1442
	p256_double(&Q);
1443

1444
	/*
1445
	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1446
	 * have the following:
1447
	 *
1448
	 *   z = 0, t = 0   return P (normal addition)
1449
	 *   z = 0, t = 1   return P (normal addition)
1450
	 *   z = 1, t = 0   return Q (a 'double' case)
1451
	 *   z = 1, t = 1   report an error (P+Q = 0)
1452
	 */
1453
	CCOPY(z & ~t, &P, &Q, sizeof Q);
1454
	p256_to_affine(&P);
1455
	p256_encode(A, &P);
1456
	r &= ~(z & t);
1457
	return r;
1458
}
1459

1460
/* see bearssl_ec.h */
1461
const br_ec_impl br_ec_p256_m31 = {
1462
	(uint32_t)0x00800000,
1463
	&api_generator,
1464
	&api_order,
1465
	&api_xoff,
1466
	&api_mul,
1467
	&api_mulgen,
1468
	&api_muladd
1469
};
1470

1471
Product

Resources

Company