CoCalc -- ec_c25519

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/ec/ec_c25519_m31.c
³⁹⁵³⁶ views
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24

25
#include "inner.h"
26

27
/* obsolete
28
#include <stdio.h>
29
#include <stdlib.h>
30
static void
31
print_int(const char *name, const uint32_t *x)
32
{
33
	size_t u;
34
	unsigned char tmp[40];
35

36
	printf("%s = ", name);
37
	for (u = 0; u < 9; u ++) {
38
		if (x[u] > 0x3FFFFFFF) {
39
			printf("INVALID:");
40
			for (u = 0; u < 9; u ++) {
41
				printf(" %08X", x[u]);
42
			}
43
			printf("\n");
44
			return;
45
		}
46
	}
47
	memset(tmp, 0, sizeof tmp);
48
	for (u = 0; u < 9; u ++) {
49
		uint64_t w;
50
		int j, k;
51

52
		w = x[u];
53
		j = 30 * (int)u;
54
		k = j & 7;
55
		if (k != 0) {
56
			w <<= k;
57
			j -= k;
58
		}
59
		k = j >> 3;
60
		for (j = 0; j < 8; j ++) {
61
			tmp[39 - k - j] |= (unsigned char)w;
62
			w >>= 8;
63
		}
64
	}
65
	for (u = 8; u < 40; u ++) {
66
		printf("%02X", tmp[u]);
67
	}
68
	printf("\n");
69
}
70
*/
71

72
/*
73
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74
 * that right-shifting a signed negative integer copies the sign bit
75
 * (arithmetic right-shift). This is "implementation-defined behaviour",
76
 * i.e. it is not undefined, but it may differ between compilers. Each
77
 * compiler is supposed to document its behaviour in that respect. GCC
78
 * explicitly defines that an arithmetic right shift is used. We expect
79
 * all other compilers to do the same, because underlying CPU offer an
80
 * arithmetic right shift opcode that could not be used otherwise.
81
 */
82
#if BR_NO_ARITH_SHIFT
83
#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
84
                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85
#else
86
#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
87
#endif
88

89
/*
90
 * Convert an integer from unsigned little-endian encoding to a sequence of
91
 * 30-bit words in little-endian order. The final "partial" word is
92
 * returned.
93
 */
94
static uint32_t
95
le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
96
{
97
	uint32_t acc;
98
	int acc_len;
99

100
	acc = 0;
101
	acc_len = 0;
102
	while (len -- > 0) {
103
		uint32_t b;
104

105
		b = *src ++;
106
		if (acc_len < 22) {
107
			acc |= b << acc_len;
108
			acc_len += 8;
109
		} else {
110
			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
111
			acc = b >> (30 - acc_len);
112
			acc_len -= 22;
113
		}
114
	}
115
	return acc;
116
}
117

118
/*
119
 * Convert an integer (30-bit words, little-endian) to unsigned
120
 * little-endian encoding. The total encoding length is provided; all
121
 * the destination bytes will be filled.
122
 */
123
static void
124
le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
125
{
126
	uint32_t acc;
127
	int acc_len;
128

129
	acc = 0;
130
	acc_len = 0;
131
	while (len -- > 0) {
132
		if (acc_len < 8) {
133
			uint32_t w;
134

135
			w = *src ++;
136
			*dst ++ = (unsigned char)(acc | (w << acc_len));
137
			acc = w >> (8 - acc_len);
138
			acc_len += 22;
139
		} else {
140
			*dst ++ = (unsigned char)acc;
141
			acc >>= 8;
142
			acc_len -= 8;
143
		}
144
	}
145
}
146

147
/*
148
 * Multiply two integers. Source integers are represented as arrays of
149
 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150
 * 18 words of 30 bits each.
151
 */
152
static void
153
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
154
{
155
	/*
156
	 * Maximum intermediate result is no more than
157
	 * 10376293531797946367, which fits in 64 bits. Reason:
158
	 *
159
	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160
	 *   10376293531797946367 < 9663676407 * 2^30
161
	 *
162
	 * Thus, adding together 9 products of 30-bit integers, with
163
	 * a carry of at most 9663676406, yields an integer that fits
164
	 * on 64 bits and generates a carry of at most 9663676406.
165
	 */
166
	uint64_t t[17];
167
	uint64_t cc;
168
	int i;
169

170
	t[ 0] = MUL31(a[0], b[0]);
171
	t[ 1] = MUL31(a[0], b[1])
172
		+ MUL31(a[1], b[0]);
173
	t[ 2] = MUL31(a[0], b[2])
174
		+ MUL31(a[1], b[1])
175
		+ MUL31(a[2], b[0]);
176
	t[ 3] = MUL31(a[0], b[3])
177
		+ MUL31(a[1], b[2])
178
		+ MUL31(a[2], b[1])
179
		+ MUL31(a[3], b[0]);
180
	t[ 4] = MUL31(a[0], b[4])
181
		+ MUL31(a[1], b[3])
182
		+ MUL31(a[2], b[2])
183
		+ MUL31(a[3], b[1])
184
		+ MUL31(a[4], b[0]);
185
	t[ 5] = MUL31(a[0], b[5])
186
		+ MUL31(a[1], b[4])
187
		+ MUL31(a[2], b[3])
188
		+ MUL31(a[3], b[2])
189
		+ MUL31(a[4], b[1])
190
		+ MUL31(a[5], b[0]);
191
	t[ 6] = MUL31(a[0], b[6])
192
		+ MUL31(a[1], b[5])
193
		+ MUL31(a[2], b[4])
194
		+ MUL31(a[3], b[3])
195
		+ MUL31(a[4], b[2])
196
		+ MUL31(a[5], b[1])
197
		+ MUL31(a[6], b[0]);
198
	t[ 7] = MUL31(a[0], b[7])
199
		+ MUL31(a[1], b[6])
200
		+ MUL31(a[2], b[5])
201
		+ MUL31(a[3], b[4])
202
		+ MUL31(a[4], b[3])
203
		+ MUL31(a[5], b[2])
204
		+ MUL31(a[6], b[1])
205
		+ MUL31(a[7], b[0]);
206
	t[ 8] = MUL31(a[0], b[8])
207
		+ MUL31(a[1], b[7])
208
		+ MUL31(a[2], b[6])
209
		+ MUL31(a[3], b[5])
210
		+ MUL31(a[4], b[4])
211
		+ MUL31(a[5], b[3])
212
		+ MUL31(a[6], b[2])
213
		+ MUL31(a[7], b[1])
214
		+ MUL31(a[8], b[0]);
215
	t[ 9] = MUL31(a[1], b[8])
216
		+ MUL31(a[2], b[7])
217
		+ MUL31(a[3], b[6])
218
		+ MUL31(a[4], b[5])
219
		+ MUL31(a[5], b[4])
220
		+ MUL31(a[6], b[3])
221
		+ MUL31(a[7], b[2])
222
		+ MUL31(a[8], b[1]);
223
	t[10] = MUL31(a[2], b[8])
224
		+ MUL31(a[3], b[7])
225
		+ MUL31(a[4], b[6])
226
		+ MUL31(a[5], b[5])
227
		+ MUL31(a[6], b[4])
228
		+ MUL31(a[7], b[3])
229
		+ MUL31(a[8], b[2]);
230
	t[11] = MUL31(a[3], b[8])
231
		+ MUL31(a[4], b[7])
232
		+ MUL31(a[5], b[6])
233
		+ MUL31(a[6], b[5])
234
		+ MUL31(a[7], b[4])
235
		+ MUL31(a[8], b[3]);
236
	t[12] = MUL31(a[4], b[8])
237
		+ MUL31(a[5], b[7])
238
		+ MUL31(a[6], b[6])
239
		+ MUL31(a[7], b[5])
240
		+ MUL31(a[8], b[4]);
241
	t[13] = MUL31(a[5], b[8])
242
		+ MUL31(a[6], b[7])
243
		+ MUL31(a[7], b[6])
244
		+ MUL31(a[8], b[5]);
245
	t[14] = MUL31(a[6], b[8])
246
		+ MUL31(a[7], b[7])
247
		+ MUL31(a[8], b[6]);
248
	t[15] = MUL31(a[7], b[8])
249
		+ MUL31(a[8], b[7]);
250
	t[16] = MUL31(a[8], b[8]);
251

252
	/*
253
	 * Propagate carries.
254
	 */
255
	cc = 0;
256
	for (i = 0; i < 17; i ++) {
257
		uint64_t w;
258

259
		w = t[i] + cc;
260
		d[i] = (uint32_t)w & 0x3FFFFFFF;
261
		cc = w >> 30;
262
	}
263
	d[17] = (uint32_t)cc;
264
}
265

266
/*
267
 * Square a 270-bit integer, represented as an array of nine 30-bit words.
268
 * Result uses 18 words of 30 bits each.
269
 */
270
static void
271
square9(uint32_t *d, const uint32_t *a)
272
{
273
	uint64_t t[17];
274
	uint64_t cc;
275
	int i;
276

277
	t[ 0] = MUL31(a[0], a[0]);
278
	t[ 1] = ((MUL31(a[0], a[1])) << 1);
279
	t[ 2] = MUL31(a[1], a[1])
280
		+ ((MUL31(a[0], a[2])) << 1);
281
	t[ 3] = ((MUL31(a[0], a[3])
282
		+ MUL31(a[1], a[2])) << 1);
283
	t[ 4] = MUL31(a[2], a[2])
284
		+ ((MUL31(a[0], a[4])
285
		+ MUL31(a[1], a[3])) << 1);
286
	t[ 5] = ((MUL31(a[0], a[5])
287
		+ MUL31(a[1], a[4])
288
		+ MUL31(a[2], a[3])) << 1);
289
	t[ 6] = MUL31(a[3], a[3])
290
		+ ((MUL31(a[0], a[6])
291
		+ MUL31(a[1], a[5])
292
		+ MUL31(a[2], a[4])) << 1);
293
	t[ 7] = ((MUL31(a[0], a[7])
294
		+ MUL31(a[1], a[6])
295
		+ MUL31(a[2], a[5])
296
		+ MUL31(a[3], a[4])) << 1);
297
	t[ 8] = MUL31(a[4], a[4])
298
		+ ((MUL31(a[0], a[8])
299
		+ MUL31(a[1], a[7])
300
		+ MUL31(a[2], a[6])
301
		+ MUL31(a[3], a[5])) << 1);
302
	t[ 9] = ((MUL31(a[1], a[8])
303
		+ MUL31(a[2], a[7])
304
		+ MUL31(a[3], a[6])
305
		+ MUL31(a[4], a[5])) << 1);
306
	t[10] = MUL31(a[5], a[5])
307
		+ ((MUL31(a[2], a[8])
308
		+ MUL31(a[3], a[7])
309
		+ MUL31(a[4], a[6])) << 1);
310
	t[11] = ((MUL31(a[3], a[8])
311
		+ MUL31(a[4], a[7])
312
		+ MUL31(a[5], a[6])) << 1);
313
	t[12] = MUL31(a[6], a[6])
314
		+ ((MUL31(a[4], a[8])
315
		+ MUL31(a[5], a[7])) << 1);
316
	t[13] = ((MUL31(a[5], a[8])
317
		+ MUL31(a[6], a[7])) << 1);
318
	t[14] = MUL31(a[7], a[7])
319
		+ ((MUL31(a[6], a[8])) << 1);
320
	t[15] = ((MUL31(a[7], a[8])) << 1);
321
	t[16] = MUL31(a[8], a[8]);
322

323
	/*
324
	 * Propagate carries.
325
	 */
326
	cc = 0;
327
	for (i = 0; i < 17; i ++) {
328
		uint64_t w;
329

330
		w = t[i] + cc;
331
		d[i] = (uint32_t)w & 0x3FFFFFFF;
332
		cc = w >> 30;
333
	}
334
	d[17] = (uint32_t)cc;
335
}
336

337
/*
338
 * Perform a "final reduction" in field F255 (field for Curve25519)
339
 * The source value must be less than twice the modulus. If the value
340
 * is not lower than the modulus, then the modulus is subtracted and
341
 * this function returns 1; otherwise, it leaves it untouched and it
342
 * returns 0.
343
 */
344
static uint32_t
345
reduce_final_f255(uint32_t *d)
346
{
347
	uint32_t t[9];
348
	uint32_t cc;
349
	int i;
350

351
	memcpy(t, d, sizeof t);
352
	cc = 19;
353
	for (i = 0; i < 9; i ++) {
354
		uint32_t w;
355

356
		w = t[i] + cc;
357
		cc = w >> 30;
358
		t[i] = w & 0x3FFFFFFF;
359
	}
360
	cc = t[8] >> 15;
361
	t[8] &= 0x7FFF;
362
	CCOPY(cc, d, t, sizeof t);
363
	return cc;
364
}
365

366
/*
367
 * Perform a multiplication of two integers modulo 2^255-19.
368
 * Operands are arrays of 9 words, each containing 30 bits of data, in
369
 * little-endian order. Input value may be up to 2^256-1; on output, value
370
 * fits on 256 bits and is lower than twice the modulus.
371
 */
372
static void
373
f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374
{
375
	uint32_t t[18], cc;
376
	int i;
377

378
	/*
379
	 * Compute raw multiplication. All result words fit in 30 bits
380
	 * each; upper word (t[17]) must fit on 2 bits, since the product
381
	 * of two 256-bit integers must fit on 512 bits.
382
	 */
383
	mul9(t, a, b);
384

385
	/*
386
	 * Modular reduction: each high word is added where necessary.
387
	 * Since the modulus is 2^255-19 and word 9 corresponds to
388
	 * offset 9*30 = 270, word 9+k must be added to word k with
389
	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
390
	 * added that way.
391
	 *
392
	 * Keeping the carry on 32 bits helps with 32-bit architectures,
393
	 * and does not noticeably impact performance on 64-bit systems.
394
	 */
395
	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
396
	t[8] &= 0x7FFF;
397
	for (i = 0; i < 9; i ++) {
398
		uint64_t w;
399

400
		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
401
		t[i] = (uint32_t)w & 0x3FFFFFFF;
402
		cc = (uint32_t)(w >> 30);  /* at most 622592 */
403
	}
404

405
	/*
406
	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
407
	 * This was split into two parts (upper of 257 bits, lower of 255
408
	 * bits), and the upper was added to the lower with a factor 19,
409
	 * which means that the intermediate value is less than 77*2^255
410
	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
411
	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
412
	 */
413
	cc = MUL15(t[8] >> 15, 19);
414
	t[8] &= 0x7FFF;
415
	for (i = 0; i < 9; i ++) {
416
		uint32_t z;
417

418
		z = t[i] + cc;
419
		d[i] = z & 0x3FFFFFFF;
420
		cc = z >> 30;
421
	}
422

423
	/*
424
	 * Final result is at most 2^255 + 1443. In particular, the last
425
	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
426
	 */
427
}
428

429
/*
430
 * Perform a squaring of an integer modulo 2^255-19.
431
 * Operands are arrays of 9 words, each containing 30 bits of data, in
432
 * little-endian order. Input value may be up to 2^256-1; on output, value
433
 * fits on 256 bits and is lower than twice the modulus.
434
 */
435
static void
436
f255_square(uint32_t *d, const uint32_t *a)
437
{
438
	uint32_t t[18], cc;
439
	int i;
440

441
	/*
442
	 * Compute raw squaring. All result words fit in 30 bits
443
	 * each; upper word (t[17]) must fit on 2 bits, since the square
444
	 * of a 256-bit integers must fit on 512 bits.
445
	 */
446
	square9(t, a);
447

448
	/*
449
	 * Modular reduction: each high word is added where necessary.
450
	 * See f255_mul() for details on the reduction and carry limits.
451
	 */
452
	cc = MUL15(t[8] >> 15, 19);
453
	t[8] &= 0x7FFF;
454
	for (i = 0; i < 9; i ++) {
455
		uint64_t w;
456

457
		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
458
		t[i] = (uint32_t)w & 0x3FFFFFFF;
459
		cc = (uint32_t)(w >> 30);
460
	}
461
	cc = MUL15(t[8] >> 15, 19);
462
	t[8] &= 0x7FFF;
463
	for (i = 0; i < 9; i ++) {
464
		uint32_t z;
465

466
		z = t[i] + cc;
467
		d[i] = z & 0x3FFFFFFF;
468
		cc = z >> 30;
469
	}
470
}
471

472
/*
473
 * Add two values in F255. Partial reduction is performed (down to less
474
 * than twice the modulus).
475
 */
476
static void
477
f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
478
{
479
	/*
480
	 * Since operand words fit on 30 bits, we can use 32-bit
481
	 * variables throughout.
482
	 */
483
	int i;
484
	uint32_t cc, w;
485

486
	cc = 0;
487
	for (i = 0; i < 9; i ++) {
488
		w = a[i] + b[i] + cc;
489
		d[i] = w & 0x3FFFFFFF;
490
		cc = w >> 30;
491
	}
492
	cc = MUL15(w >> 15, 19);
493
	d[8] &= 0x7FFF;
494
	for (i = 0; i < 9; i ++) {
495
		w = d[i] + cc;
496
		d[i] = w & 0x3FFFFFFF;
497
		cc = w >> 30;
498
	}
499
}
500

501
/*
502
 * Subtract one value from another in F255. Partial reduction is
503
 * performed (down to less than twice the modulus).
504
 */
505
static void
506
f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
507
{
508
	/*
509
	 * We actually compute a - b + 2*p, so that the final value is
510
	 * necessarily positive.
511
	 */
512
	int i;
513
	uint32_t cc, w;
514

515
	cc = (uint32_t)-38;
516
	for (i = 0; i < 9; i ++) {
517
		w = a[i] - b[i] + cc;
518
		d[i] = w & 0x3FFFFFFF;
519
		cc = ARSH(w, 30);
520
	}
521
	cc = MUL15((w + 0x10000) >> 15, 19);
522
	d[8] &= 0x7FFF;
523
	for (i = 0; i < 9; i ++) {
524
		w = d[i] + cc;
525
		d[i] = w & 0x3FFFFFFF;
526
		cc = w >> 30;
527
	}
528
}
529

530
/*
531
 * Multiply an integer by the 'A24' constant (121665). Partial reduction
532
 * is performed (down to less than twice the modulus).
533
 */
534
static void
535
f255_mul_a24(uint32_t *d, const uint32_t *a)
536
{
537
	int i;
538
	uint64_t w;
539
	uint32_t cc;
540

541
	/*
542
	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
543
	 * We single out the processing of the last word: intermediate
544
	 * value w is up to 121665*2^16, yielding a carry for the next
545
	 * loop of at most 19*(121665*2^16/2^15) = 4623289.
546
	 */
547
	cc = 0;
548
	for (i = 0; i < 8; i ++) {
549
		w = MUL31(a[i], 121665) + (uint64_t)cc;
550
		d[i] = (uint32_t)w & 0x3FFFFFFF;
551
		cc = (uint32_t)(w >> 30);
552
	}
553
	w = MUL31(a[8], 121665) + (uint64_t)cc;
554
	d[8] = (uint32_t)w & 0x7FFF;
555
	cc = MUL15((uint32_t)(w >> 15), 19);
556

557
	for (i = 0; i < 9; i ++) {
558
		uint32_t z;
559

560
		z = d[i] + cc;
561
		d[i] = z & 0x3FFFFFFF;
562
		cc = z >> 30;
563
	}
564
}
565

566
static const unsigned char GEN[] = {
567
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
568
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
569
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
570
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
571
};
572

573
static const unsigned char ORDER[] = {
574
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
575
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
576
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
577
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
578
};
579

580
static const unsigned char *
581
api_generator(int curve, size_t *len)
582
{
583
	(void)curve;
584
	*len = 32;
585
	return GEN;
586
}
587

588
static const unsigned char *
589
api_order(int curve, size_t *len)
590
{
591
	(void)curve;
592
	*len = 32;
593
	return ORDER;
594
}
595

596
static size_t
597
api_xoff(int curve, size_t *len)
598
{
599
	(void)curve;
600
	*len = 32;
601
	return 0;
602
}
603

604
static void
605
cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
606
{
607
	int i;
608

609
	ctl = -ctl;
610
	for (i = 0; i < 9; i ++) {
611
		uint32_t aw, bw, tw;
612

613
		aw = a[i];
614
		bw = b[i];
615
		tw = ctl & (aw ^ bw);
616
		a[i] = aw ^ tw;
617
		b[i] = bw ^ tw;
618
	}
619
}
620

621
static uint32_t
622
api_mul(unsigned char *G, size_t Glen,
623
	const unsigned char *kb, size_t kblen, int curve)
624
{
625
	uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
626
	uint32_t a[9], aa[9], b[9], bb[9];
627
	uint32_t c[9], d[9], e[9], da[9], cb[9];
628
	unsigned char k[32];
629
	uint32_t swap;
630
	int i;
631

632
	(void)curve;
633

634
	/*
635
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
636
	 * in 32 bytes as well.
637
	 * RFC 7748 mandates that the high bit of the last point byte must
638
	 * be ignored/cleared.
639
	 */
640
	if (Glen != 32 || kblen > 32) {
641
		return 0;
642
	}
643
	G[31] &= 0x7F;
644

645
	/*
646
	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
647
	 * into Montgomery representation.
648
	 */
649
	x1[8] = le8_to_le30(x1, G, 32);
650
	memcpy(x3, x1, sizeof x1);
651
	memset(z2, 0, sizeof z2);
652
	memset(x2, 0, sizeof x2);
653
	x2[0] = 1;
654
	memset(z3, 0, sizeof z3);
655
	z3[0] = 1;
656

657
	memset(k, 0, (sizeof k) - kblen);
658
	memcpy(k + (sizeof k) - kblen, kb, kblen);
659
	k[31] &= 0xF8;
660
	k[0] &= 0x7F;
661
	k[0] |= 0x40;
662

663
	/* obsolete
664
	print_int("x1", x1);
665
	*/
666

667
	swap = 0;
668
	for (i = 254; i >= 0; i --) {
669
		uint32_t kt;
670

671
		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
672
		swap ^= kt;
673
		cswap(x2, x3, swap);
674
		cswap(z2, z3, swap);
675
		swap = kt;
676

677
		/* obsolete
678
		print_int("x2", x2);
679
		print_int("z2", z2);
680
		print_int("x3", x3);
681
		print_int("z3", z3);
682
		*/
683

684
		f255_add(a, x2, z2);
685
		f255_square(aa, a);
686
		f255_sub(b, x2, z2);
687
		f255_square(bb, b);
688
		f255_sub(e, aa, bb);
689
		f255_add(c, x3, z3);
690
		f255_sub(d, x3, z3);
691
		f255_mul(da, d, a);
692
		f255_mul(cb, c, b);
693

694
		/* obsolete
695
		print_int("a ", a);
696
		print_int("aa", aa);
697
		print_int("b ", b);
698
		print_int("bb", bb);
699
		print_int("e ", e);
700
		print_int("c ", c);
701
		print_int("d ", d);
702
		print_int("da", da);
703
		print_int("cb", cb);
704
		*/
705

706
		f255_add(x3, da, cb);
707
		f255_square(x3, x3);
708
		f255_sub(z3, da, cb);
709
		f255_square(z3, z3);
710
		f255_mul(z3, z3, x1);
711
		f255_mul(x2, aa, bb);
712
		f255_mul_a24(z2, e);
713
		f255_add(z2, z2, aa);
714
		f255_mul(z2, e, z2);
715

716
		/* obsolete
717
		print_int("x2", x2);
718
		print_int("z2", z2);
719
		print_int("x3", x3);
720
		print_int("z3", z3);
721
		*/
722
	}
723
	cswap(x2, x3, swap);
724
	cswap(z2, z3, swap);
725

726
	/*
727
	 * Inverse z2 with a modular exponentiation. This is a simple
728
	 * square-and-multiply algorithm; we mutualise most non-squarings
729
	 * since the exponent contains almost only ones.
730
	 */
731
	memcpy(a, z2, sizeof z2);
732
	for (i = 0; i < 15; i ++) {
733
		f255_square(a, a);
734
		f255_mul(a, a, z2);
735
	}
736
	memcpy(b, a, sizeof a);
737
	for (i = 0; i < 14; i ++) {
738
		int j;
739

740
		for (j = 0; j < 16; j ++) {
741
			f255_square(b, b);
742
		}
743
		f255_mul(b, b, a);
744
	}
745
	for (i = 14; i >= 0; i --) {
746
		f255_square(b, b);
747
		if ((0xFFEB >> i) & 1) {
748
			f255_mul(b, z2, b);
749
		}
750
	}
751
	f255_mul(x2, x2, b);
752
	reduce_final_f255(x2);
753
	le30_to_le8(G, 32, x2);
754
	return 1;
755
}
756

757
static size_t
758
api_mulgen(unsigned char *R,
759
	const unsigned char *x, size_t xlen, int curve)
760
{
761
	const unsigned char *G;
762
	size_t Glen;
763

764
	G = api_generator(curve, &Glen);
765
	memcpy(R, G, Glen);
766
	api_mul(R, Glen, x, xlen, curve);
767
	return Glen;
768
}
769

770
static uint32_t
771
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
772
	const unsigned char *x, size_t xlen,
773
	const unsigned char *y, size_t ylen, int curve)
774
{
775
	/*
776
	 * We don't implement this method, since it is used for ECDSA
777
	 * only, and there is no ECDSA over Curve25519 (which instead
778
	 * uses EdDSA).
779
	 */
780
	(void)A;
781
	(void)B;
782
	(void)len;
783
	(void)x;
784
	(void)xlen;
785
	(void)y;
786
	(void)ylen;
787
	(void)curve;
788
	return 0;
789
}
790

791
/* see bearssl_ec.h */
792
const br_ec_impl br_ec_c25519_m31 = {
793
	(uint32_t)0x20000000,
794
	&api_generator,
795
	&api_order,
796
	&api_xoff,
797
	&api_mul,
798
	&api_mulgen,
799
	&api_muladd
800
};
801

802
Product

Resources

Company