CoCalc -- ec_p256

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/ec/ec_p256_m15.c
³⁹⁵⁰⁷ views
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <[email protected]>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24

25
#include "inner.h"
26

27
/*
28
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29
 * that right-shifting a signed negative integer copies the sign bit
30
 * (arithmetic right-shift). This is "implementation-defined behaviour",
31
 * i.e. it is not undefined, but it may differ between compilers. Each
32
 * compiler is supposed to document its behaviour in that respect. GCC
33
 * explicitly defines that an arithmetic right shift is used. We expect
34
 * all other compilers to do the same, because underlying CPU offer an
35
 * arithmetic right shift opcode that could not be used otherwise.
36
 */
37
#if BR_NO_ARITH_SHIFT
38
#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
39
                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40
#else
41
#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
42
#endif
43

44
/*
45
 * Convert an integer from unsigned big-endian encoding to a sequence of
46
 * 13-bit words in little-endian order. The final "partial" word is
47
 * returned.
48
 */
49
static uint32_t
50
be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51
{
52
	uint32_t acc;
53
	int acc_len;
54

55
	acc = 0;
56
	acc_len = 0;
57
	while (len -- > 0) {
58
		acc |= (uint32_t)src[len] << acc_len;
59
		acc_len += 8;
60
		if (acc_len >= 13) {
61
			*dst ++ = acc & 0x1FFF;
62
			acc >>= 13;
63
			acc_len -= 13;
64
		}
65
	}
66
	return acc;
67
}
68

69
/*
70
 * Convert an integer (13-bit words, little-endian) to unsigned
71
 * big-endian encoding. The total encoding length is provided; all
72
 * the destination bytes will be filled.
73
 */
74
static void
75
le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76
{
77
	uint32_t acc;
78
	int acc_len;
79

80
	acc = 0;
81
	acc_len = 0;
82
	while (len -- > 0) {
83
		if (acc_len < 8) {
84
			acc |= (*src ++) << acc_len;
85
			acc_len += 13;
86
		}
87
		dst[len] = (unsigned char)acc;
88
		acc >>= 8;
89
		acc_len -= 8;
90
	}
91
}
92

93
/*
94
 * Normalise an array of words to a strict 13 bits per word. Returned
95
 * value is the resulting carry. The source (w) and destination (d)
96
 * arrays may be identical, but shall not overlap partially.
97
 */
98
static inline uint32_t
99
norm13(uint32_t *d, const uint32_t *w, size_t len)
100
{
101
	size_t u;
102
	uint32_t cc;
103

104
	cc = 0;
105
	for (u = 0; u < len; u ++) {
106
		int32_t z;
107

108
		z = w[u] + cc;
109
		d[u] = z & 0x1FFF;
110
		cc = ARSH(z, 13);
111
	}
112
	return cc;
113
}
114

115
/*
116
 * mul20() multiplies two 260-bit integers together. Each word must fit
117
 * on 13 bits; source operands use 20 words, destination operand
118
 * receives 40 words. All overlaps allowed.
119
 *
120
 * square20() computes the square of a 260-bit integer. Each word must
121
 * fit on 13 bits; source operand uses 20 words, destination operand
122
 * receives 40 words. All overlaps allowed.
123
 */
124

125
#if BR_SLOW_MUL15
126

127
static void
128
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
129
{
130
	/*
131
	 * Two-level Karatsuba: turns a 20x20 multiplication into
132
	 * nine 5x5 multiplications. We use 13-bit words but do not
133
	 * propagate carries immediately, so words may expand:
134
	 *
135
	 *  - First Karatsuba decomposition turns the 20x20 mul on
136
	 *    13-bit words into three 10x10 muls, two on 13-bit words
137
	 *    and one on 14-bit words.
138
	 *
139
	 *  - Second Karatsuba decomposition further splits these into:
140
	 *
141
	 *     * four 5x5 muls on 13-bit words
142
	 *     * four 5x5 muls on 14-bit words
143
	 *     * one 5x5 mul on 15-bit words
144
	 *
145
	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
146
	 * or 15-bit words, respectively.
147
	 */
148
	uint32_t u[45], v[45], w[90];
149
	uint32_t cc;
150
	int i;
151

152
#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
153
		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
154
			+ (s2w)[5 * (s2_off) + 0]; \
155
		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
156
			+ (s2w)[5 * (s2_off) + 1]; \
157
		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
158
			+ (s2w)[5 * (s2_off) + 2]; \
159
		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
160
			+ (s2w)[5 * (s2_off) + 3]; \
161
		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
162
			+ (s2w)[5 * (s2_off) + 4]; \
163
	} while (0)
164

165
#define ZADDT(dw, d_off, sw, s_off)   do { \
166
		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
167
		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
168
		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
169
		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
170
		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
171
	} while (0)
172

173
#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
174
		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
175
			+ (s2w)[5 * (s2_off) + 0]; \
176
		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
177
			+ (s2w)[5 * (s2_off) + 1]; \
178
		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
179
			+ (s2w)[5 * (s2_off) + 2]; \
180
		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
181
			+ (s2w)[5 * (s2_off) + 3]; \
182
		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
183
			+ (s2w)[5 * (s2_off) + 4]; \
184
	} while (0)
185

186
#define CPR1(w, cprcc)   do { \
187
		uint32_t cprz = (w) + cprcc; \
188
		(w) = cprz & 0x1FFF; \
189
		cprcc = cprz >> 13; \
190
	} while (0)
191

192
#define CPR(dw, d_off)   do { \
193
		uint32_t cprcc; \
194
		cprcc = 0; \
195
		CPR1((dw)[(d_off) + 0], cprcc); \
196
		CPR1((dw)[(d_off) + 1], cprcc); \
197
		CPR1((dw)[(d_off) + 2], cprcc); \
198
		CPR1((dw)[(d_off) + 3], cprcc); \
199
		CPR1((dw)[(d_off) + 4], cprcc); \
200
		CPR1((dw)[(d_off) + 5], cprcc); \
201
		CPR1((dw)[(d_off) + 6], cprcc); \
202
		CPR1((dw)[(d_off) + 7], cprcc); \
203
		CPR1((dw)[(d_off) + 8], cprcc); \
204
		(dw)[(d_off) + 9] = cprcc; \
205
	} while (0)
206

207
	memcpy(u, a, 20 * sizeof *a);
208
	ZADD(u, 4, a, 0, a, 1);
209
	ZADD(u, 5, a, 2, a, 3);
210
	ZADD(u, 6, a, 0, a, 2);
211
	ZADD(u, 7, a, 1, a, 3);
212
	ZADD(u, 8, u, 6, u, 7);
213

214
	memcpy(v, b, 20 * sizeof *b);
215
	ZADD(v, 4, b, 0, b, 1);
216
	ZADD(v, 5, b, 2, b, 3);
217
	ZADD(v, 6, b, 0, b, 2);
218
	ZADD(v, 7, b, 1, b, 3);
219
	ZADD(v, 8, v, 6, v, 7);
220

221
	/*
222
	 * Do the eight first 8x8 muls. Source words are at most 16382
223
	 * each, so we can add product results together "as is" in 32-bit
224
	 * words.
225
	 */
226
	for (i = 0; i < 40; i += 5) {
227
		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
228
		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
229
			+ MUL15(u[i + 1], v[i + 0]);
230
		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
231
			+ MUL15(u[i + 1], v[i + 1])
232
			+ MUL15(u[i + 2], v[i + 0]);
233
		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
234
			+ MUL15(u[i + 1], v[i + 2])
235
			+ MUL15(u[i + 2], v[i + 1])
236
			+ MUL15(u[i + 3], v[i + 0]);
237
		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
238
			+ MUL15(u[i + 1], v[i + 3])
239
			+ MUL15(u[i + 2], v[i + 2])
240
			+ MUL15(u[i + 3], v[i + 1])
241
			+ MUL15(u[i + 4], v[i + 0]);
242
		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
243
			+ MUL15(u[i + 2], v[i + 3])
244
			+ MUL15(u[i + 3], v[i + 2])
245
			+ MUL15(u[i + 4], v[i + 1]);
246
		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
247
			+ MUL15(u[i + 3], v[i + 3])
248
			+ MUL15(u[i + 4], v[i + 2]);
249
		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
250
			+ MUL15(u[i + 4], v[i + 3]);
251
		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
252
		w[(i << 1) + 9] = 0;
253
	}
254

255
	/*
256
	 * For the 9th multiplication, source words are up to 32764,
257
	 * so we must do some carry propagation. If we add up to
258
	 * 4 products and the carry is no more than 524224, then the
259
	 * result fits in 32 bits, and the next carry will be no more
260
	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
261
	 *
262
	 * We thus just skip one of the products in the middle word,
263
	 * then do a carry propagation (this reduces words to 13 bits
264
	 * each, except possibly the last, which may use up to 17 bits
265
	 * or so), then add the missing product.
266
	 */
267
	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
268
	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
269
		+ MUL15(u[40 + 1], v[40 + 0]);
270
	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
271
		+ MUL15(u[40 + 1], v[40 + 1])
272
		+ MUL15(u[40 + 2], v[40 + 0]);
273
	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
274
		+ MUL15(u[40 + 1], v[40 + 2])
275
		+ MUL15(u[40 + 2], v[40 + 1])
276
		+ MUL15(u[40 + 3], v[40 + 0]);
277
	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
278
		+ MUL15(u[40 + 1], v[40 + 3])
279
		+ MUL15(u[40 + 2], v[40 + 2])
280
		+ MUL15(u[40 + 3], v[40 + 1]);
281
		/* + MUL15(u[40 + 4], v[40 + 0]) */
282
	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
283
		+ MUL15(u[40 + 2], v[40 + 3])
284
		+ MUL15(u[40 + 3], v[40 + 2])
285
		+ MUL15(u[40 + 4], v[40 + 1]);
286
	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
287
		+ MUL15(u[40 + 3], v[40 + 3])
288
		+ MUL15(u[40 + 4], v[40 + 2]);
289
	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
290
		+ MUL15(u[40 + 4], v[40 + 3]);
291
	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
292

293
	CPR(w, 80);
294

295
	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
296

297
	/*
298
	 * The products on 14-bit words in slots 6 and 7 yield values
299
	 * up to 5*(16382^2) each, and we need to subtract two such
300
	 * values from the higher word. We need the subtraction to fit
301
	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
302
	 * However, 10*(16382^2) does not fit. So we must perform a
303
	 * bit of reduction here.
304
	 */
305
	CPR(w, 60);
306
	CPR(w, 70);
307

308
	/*
309
	 * Recompose results.
310
	 */
311

312
	/* 0..1*0..1 into 0..3 */
313
	ZSUB2F(w, 8, w, 0, w, 2);
314
	ZSUB2F(w, 9, w, 1, w, 3);
315
	ZADDT(w, 1, w, 8);
316
	ZADDT(w, 2, w, 9);
317

318
	/* 2..3*2..3 into 4..7 */
319
	ZSUB2F(w, 10, w, 4, w, 6);
320
	ZSUB2F(w, 11, w, 5, w, 7);
321
	ZADDT(w, 5, w, 10);
322
	ZADDT(w, 6, w, 11);
323

324
	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
325
	ZSUB2F(w, 16, w, 12, w, 14);
326
	ZSUB2F(w, 17, w, 13, w, 15);
327
	ZADDT(w, 13, w, 16);
328
	ZADDT(w, 14, w, 17);
329

330
	/* first-level recomposition */
331
	ZSUB2F(w, 12, w, 0, w, 4);
332
	ZSUB2F(w, 13, w, 1, w, 5);
333
	ZSUB2F(w, 14, w, 2, w, 6);
334
	ZSUB2F(w, 15, w, 3, w, 7);
335
	ZADDT(w, 2, w, 12);
336
	ZADDT(w, 3, w, 13);
337
	ZADDT(w, 4, w, 14);
338
	ZADDT(w, 5, w, 15);
339

340
	/*
341
	 * Perform carry propagation to bring all words down to 13 bits.
342
	 */
343
	cc = norm13(d, w, 40);
344
	d[39] += (cc << 13);
345

346
#undef ZADD
347
#undef ZADDT
348
#undef ZSUB2F
349
#undef CPR1
350
#undef CPR
351
}
352

353
static inline void
354
square20(uint32_t *d, const uint32_t *a)
355
{
356
	mul20(d, a, a);
357
}
358

359
#else
360

361
static void
362
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
363
{
364
	uint32_t t[39];
365

366
	t[ 0] = MUL15(a[ 0], b[ 0]);
367
	t[ 1] = MUL15(a[ 0], b[ 1])
368
		+ MUL15(a[ 1], b[ 0]);
369
	t[ 2] = MUL15(a[ 0], b[ 2])
370
		+ MUL15(a[ 1], b[ 1])
371
		+ MUL15(a[ 2], b[ 0]);
372
	t[ 3] = MUL15(a[ 0], b[ 3])
373
		+ MUL15(a[ 1], b[ 2])
374
		+ MUL15(a[ 2], b[ 1])
375
		+ MUL15(a[ 3], b[ 0]);
376
	t[ 4] = MUL15(a[ 0], b[ 4])
377
		+ MUL15(a[ 1], b[ 3])
378
		+ MUL15(a[ 2], b[ 2])
379
		+ MUL15(a[ 3], b[ 1])
380
		+ MUL15(a[ 4], b[ 0]);
381
	t[ 5] = MUL15(a[ 0], b[ 5])
382
		+ MUL15(a[ 1], b[ 4])
383
		+ MUL15(a[ 2], b[ 3])
384
		+ MUL15(a[ 3], b[ 2])
385
		+ MUL15(a[ 4], b[ 1])
386
		+ MUL15(a[ 5], b[ 0]);
387
	t[ 6] = MUL15(a[ 0], b[ 6])
388
		+ MUL15(a[ 1], b[ 5])
389
		+ MUL15(a[ 2], b[ 4])
390
		+ MUL15(a[ 3], b[ 3])
391
		+ MUL15(a[ 4], b[ 2])
392
		+ MUL15(a[ 5], b[ 1])
393
		+ MUL15(a[ 6], b[ 0]);
394
	t[ 7] = MUL15(a[ 0], b[ 7])
395
		+ MUL15(a[ 1], b[ 6])
396
		+ MUL15(a[ 2], b[ 5])
397
		+ MUL15(a[ 3], b[ 4])
398
		+ MUL15(a[ 4], b[ 3])
399
		+ MUL15(a[ 5], b[ 2])
400
		+ MUL15(a[ 6], b[ 1])
401
		+ MUL15(a[ 7], b[ 0]);
402
	t[ 8] = MUL15(a[ 0], b[ 8])
403
		+ MUL15(a[ 1], b[ 7])
404
		+ MUL15(a[ 2], b[ 6])
405
		+ MUL15(a[ 3], b[ 5])
406
		+ MUL15(a[ 4], b[ 4])
407
		+ MUL15(a[ 5], b[ 3])
408
		+ MUL15(a[ 6], b[ 2])
409
		+ MUL15(a[ 7], b[ 1])
410
		+ MUL15(a[ 8], b[ 0]);
411
	t[ 9] = MUL15(a[ 0], b[ 9])
412
		+ MUL15(a[ 1], b[ 8])
413
		+ MUL15(a[ 2], b[ 7])
414
		+ MUL15(a[ 3], b[ 6])
415
		+ MUL15(a[ 4], b[ 5])
416
		+ MUL15(a[ 5], b[ 4])
417
		+ MUL15(a[ 6], b[ 3])
418
		+ MUL15(a[ 7], b[ 2])
419
		+ MUL15(a[ 8], b[ 1])
420
		+ MUL15(a[ 9], b[ 0]);
421
	t[10] = MUL15(a[ 0], b[10])
422
		+ MUL15(a[ 1], b[ 9])
423
		+ MUL15(a[ 2], b[ 8])
424
		+ MUL15(a[ 3], b[ 7])
425
		+ MUL15(a[ 4], b[ 6])
426
		+ MUL15(a[ 5], b[ 5])
427
		+ MUL15(a[ 6], b[ 4])
428
		+ MUL15(a[ 7], b[ 3])
429
		+ MUL15(a[ 8], b[ 2])
430
		+ MUL15(a[ 9], b[ 1])
431
		+ MUL15(a[10], b[ 0]);
432
	t[11] = MUL15(a[ 0], b[11])
433
		+ MUL15(a[ 1], b[10])
434
		+ MUL15(a[ 2], b[ 9])
435
		+ MUL15(a[ 3], b[ 8])
436
		+ MUL15(a[ 4], b[ 7])
437
		+ MUL15(a[ 5], b[ 6])
438
		+ MUL15(a[ 6], b[ 5])
439
		+ MUL15(a[ 7], b[ 4])
440
		+ MUL15(a[ 8], b[ 3])
441
		+ MUL15(a[ 9], b[ 2])
442
		+ MUL15(a[10], b[ 1])
443
		+ MUL15(a[11], b[ 0]);
444
	t[12] = MUL15(a[ 0], b[12])
445
		+ MUL15(a[ 1], b[11])
446
		+ MUL15(a[ 2], b[10])
447
		+ MUL15(a[ 3], b[ 9])
448
		+ MUL15(a[ 4], b[ 8])
449
		+ MUL15(a[ 5], b[ 7])
450
		+ MUL15(a[ 6], b[ 6])
451
		+ MUL15(a[ 7], b[ 5])
452
		+ MUL15(a[ 8], b[ 4])
453
		+ MUL15(a[ 9], b[ 3])
454
		+ MUL15(a[10], b[ 2])
455
		+ MUL15(a[11], b[ 1])
456
		+ MUL15(a[12], b[ 0]);
457
	t[13] = MUL15(a[ 0], b[13])
458
		+ MUL15(a[ 1], b[12])
459
		+ MUL15(a[ 2], b[11])
460
		+ MUL15(a[ 3], b[10])
461
		+ MUL15(a[ 4], b[ 9])
462
		+ MUL15(a[ 5], b[ 8])
463
		+ MUL15(a[ 6], b[ 7])
464
		+ MUL15(a[ 7], b[ 6])
465
		+ MUL15(a[ 8], b[ 5])
466
		+ MUL15(a[ 9], b[ 4])
467
		+ MUL15(a[10], b[ 3])
468
		+ MUL15(a[11], b[ 2])
469
		+ MUL15(a[12], b[ 1])
470
		+ MUL15(a[13], b[ 0]);
471
	t[14] = MUL15(a[ 0], b[14])
472
		+ MUL15(a[ 1], b[13])
473
		+ MUL15(a[ 2], b[12])
474
		+ MUL15(a[ 3], b[11])
475
		+ MUL15(a[ 4], b[10])
476
		+ MUL15(a[ 5], b[ 9])
477
		+ MUL15(a[ 6], b[ 8])
478
		+ MUL15(a[ 7], b[ 7])
479
		+ MUL15(a[ 8], b[ 6])
480
		+ MUL15(a[ 9], b[ 5])
481
		+ MUL15(a[10], b[ 4])
482
		+ MUL15(a[11], b[ 3])
483
		+ MUL15(a[12], b[ 2])
484
		+ MUL15(a[13], b[ 1])
485
		+ MUL15(a[14], b[ 0]);
486
	t[15] = MUL15(a[ 0], b[15])
487
		+ MUL15(a[ 1], b[14])
488
		+ MUL15(a[ 2], b[13])
489
		+ MUL15(a[ 3], b[12])
490
		+ MUL15(a[ 4], b[11])
491
		+ MUL15(a[ 5], b[10])
492
		+ MUL15(a[ 6], b[ 9])
493
		+ MUL15(a[ 7], b[ 8])
494
		+ MUL15(a[ 8], b[ 7])
495
		+ MUL15(a[ 9], b[ 6])
496
		+ MUL15(a[10], b[ 5])
497
		+ MUL15(a[11], b[ 4])
498
		+ MUL15(a[12], b[ 3])
499
		+ MUL15(a[13], b[ 2])
500
		+ MUL15(a[14], b[ 1])
501
		+ MUL15(a[15], b[ 0]);
502
	t[16] = MUL15(a[ 0], b[16])
503
		+ MUL15(a[ 1], b[15])
504
		+ MUL15(a[ 2], b[14])
505
		+ MUL15(a[ 3], b[13])
506
		+ MUL15(a[ 4], b[12])
507
		+ MUL15(a[ 5], b[11])
508
		+ MUL15(a[ 6], b[10])
509
		+ MUL15(a[ 7], b[ 9])
510
		+ MUL15(a[ 8], b[ 8])
511
		+ MUL15(a[ 9], b[ 7])
512
		+ MUL15(a[10], b[ 6])
513
		+ MUL15(a[11], b[ 5])
514
		+ MUL15(a[12], b[ 4])
515
		+ MUL15(a[13], b[ 3])
516
		+ MUL15(a[14], b[ 2])
517
		+ MUL15(a[15], b[ 1])
518
		+ MUL15(a[16], b[ 0]);
519
	t[17] = MUL15(a[ 0], b[17])
520
		+ MUL15(a[ 1], b[16])
521
		+ MUL15(a[ 2], b[15])
522
		+ MUL15(a[ 3], b[14])
523
		+ MUL15(a[ 4], b[13])
524
		+ MUL15(a[ 5], b[12])
525
		+ MUL15(a[ 6], b[11])
526
		+ MUL15(a[ 7], b[10])
527
		+ MUL15(a[ 8], b[ 9])
528
		+ MUL15(a[ 9], b[ 8])
529
		+ MUL15(a[10], b[ 7])
530
		+ MUL15(a[11], b[ 6])
531
		+ MUL15(a[12], b[ 5])
532
		+ MUL15(a[13], b[ 4])
533
		+ MUL15(a[14], b[ 3])
534
		+ MUL15(a[15], b[ 2])
535
		+ MUL15(a[16], b[ 1])
536
		+ MUL15(a[17], b[ 0]);
537
	t[18] = MUL15(a[ 0], b[18])
538
		+ MUL15(a[ 1], b[17])
539
		+ MUL15(a[ 2], b[16])
540
		+ MUL15(a[ 3], b[15])
541
		+ MUL15(a[ 4], b[14])
542
		+ MUL15(a[ 5], b[13])
543
		+ MUL15(a[ 6], b[12])
544
		+ MUL15(a[ 7], b[11])
545
		+ MUL15(a[ 8], b[10])
546
		+ MUL15(a[ 9], b[ 9])
547
		+ MUL15(a[10], b[ 8])
548
		+ MUL15(a[11], b[ 7])
549
		+ MUL15(a[12], b[ 6])
550
		+ MUL15(a[13], b[ 5])
551
		+ MUL15(a[14], b[ 4])
552
		+ MUL15(a[15], b[ 3])
553
		+ MUL15(a[16], b[ 2])
554
		+ MUL15(a[17], b[ 1])
555
		+ MUL15(a[18], b[ 0]);
556
	t[19] = MUL15(a[ 0], b[19])
557
		+ MUL15(a[ 1], b[18])
558
		+ MUL15(a[ 2], b[17])
559
		+ MUL15(a[ 3], b[16])
560
		+ MUL15(a[ 4], b[15])
561
		+ MUL15(a[ 5], b[14])
562
		+ MUL15(a[ 6], b[13])
563
		+ MUL15(a[ 7], b[12])
564
		+ MUL15(a[ 8], b[11])
565
		+ MUL15(a[ 9], b[10])
566
		+ MUL15(a[10], b[ 9])
567
		+ MUL15(a[11], b[ 8])
568
		+ MUL15(a[12], b[ 7])
569
		+ MUL15(a[13], b[ 6])
570
		+ MUL15(a[14], b[ 5])
571
		+ MUL15(a[15], b[ 4])
572
		+ MUL15(a[16], b[ 3])
573
		+ MUL15(a[17], b[ 2])
574
		+ MUL15(a[18], b[ 1])
575
		+ MUL15(a[19], b[ 0]);
576
	t[20] = MUL15(a[ 1], b[19])
577
		+ MUL15(a[ 2], b[18])
578
		+ MUL15(a[ 3], b[17])
579
		+ MUL15(a[ 4], b[16])
580
		+ MUL15(a[ 5], b[15])
581
		+ MUL15(a[ 6], b[14])
582
		+ MUL15(a[ 7], b[13])
583
		+ MUL15(a[ 8], b[12])
584
		+ MUL15(a[ 9], b[11])
585
		+ MUL15(a[10], b[10])
586
		+ MUL15(a[11], b[ 9])
587
		+ MUL15(a[12], b[ 8])
588
		+ MUL15(a[13], b[ 7])
589
		+ MUL15(a[14], b[ 6])
590
		+ MUL15(a[15], b[ 5])
591
		+ MUL15(a[16], b[ 4])
592
		+ MUL15(a[17], b[ 3])
593
		+ MUL15(a[18], b[ 2])
594
		+ MUL15(a[19], b[ 1]);
595
	t[21] = MUL15(a[ 2], b[19])
596
		+ MUL15(a[ 3], b[18])
597
		+ MUL15(a[ 4], b[17])
598
		+ MUL15(a[ 5], b[16])
599
		+ MUL15(a[ 6], b[15])
600
		+ MUL15(a[ 7], b[14])
601
		+ MUL15(a[ 8], b[13])
602
		+ MUL15(a[ 9], b[12])
603
		+ MUL15(a[10], b[11])
604
		+ MUL15(a[11], b[10])
605
		+ MUL15(a[12], b[ 9])
606
		+ MUL15(a[13], b[ 8])
607
		+ MUL15(a[14], b[ 7])
608
		+ MUL15(a[15], b[ 6])
609
		+ MUL15(a[16], b[ 5])
610
		+ MUL15(a[17], b[ 4])
611
		+ MUL15(a[18], b[ 3])
612
		+ MUL15(a[19], b[ 2]);
613
	t[22] = MUL15(a[ 3], b[19])
614
		+ MUL15(a[ 4], b[18])
615
		+ MUL15(a[ 5], b[17])
616
		+ MUL15(a[ 6], b[16])
617
		+ MUL15(a[ 7], b[15])
618
		+ MUL15(a[ 8], b[14])
619
		+ MUL15(a[ 9], b[13])
620
		+ MUL15(a[10], b[12])
621
		+ MUL15(a[11], b[11])
622
		+ MUL15(a[12], b[10])
623
		+ MUL15(a[13], b[ 9])
624
		+ MUL15(a[14], b[ 8])
625
		+ MUL15(a[15], b[ 7])
626
		+ MUL15(a[16], b[ 6])
627
		+ MUL15(a[17], b[ 5])
628
		+ MUL15(a[18], b[ 4])
629
		+ MUL15(a[19], b[ 3]);
630
	t[23] = MUL15(a[ 4], b[19])
631
		+ MUL15(a[ 5], b[18])
632
		+ MUL15(a[ 6], b[17])
633
		+ MUL15(a[ 7], b[16])
634
		+ MUL15(a[ 8], b[15])
635
		+ MUL15(a[ 9], b[14])
636
		+ MUL15(a[10], b[13])
637
		+ MUL15(a[11], b[12])
638
		+ MUL15(a[12], b[11])
639
		+ MUL15(a[13], b[10])
640
		+ MUL15(a[14], b[ 9])
641
		+ MUL15(a[15], b[ 8])
642
		+ MUL15(a[16], b[ 7])
643
		+ MUL15(a[17], b[ 6])
644
		+ MUL15(a[18], b[ 5])
645
		+ MUL15(a[19], b[ 4]);
646
	t[24] = MUL15(a[ 5], b[19])
647
		+ MUL15(a[ 6], b[18])
648
		+ MUL15(a[ 7], b[17])
649
		+ MUL15(a[ 8], b[16])
650
		+ MUL15(a[ 9], b[15])
651
		+ MUL15(a[10], b[14])
652
		+ MUL15(a[11], b[13])
653
		+ MUL15(a[12], b[12])
654
		+ MUL15(a[13], b[11])
655
		+ MUL15(a[14], b[10])
656
		+ MUL15(a[15], b[ 9])
657
		+ MUL15(a[16], b[ 8])
658
		+ MUL15(a[17], b[ 7])
659
		+ MUL15(a[18], b[ 6])
660
		+ MUL15(a[19], b[ 5]);
661
	t[25] = MUL15(a[ 6], b[19])
662
		+ MUL15(a[ 7], b[18])
663
		+ MUL15(a[ 8], b[17])
664
		+ MUL15(a[ 9], b[16])
665
		+ MUL15(a[10], b[15])
666
		+ MUL15(a[11], b[14])
667
		+ MUL15(a[12], b[13])
668
		+ MUL15(a[13], b[12])
669
		+ MUL15(a[14], b[11])
670
		+ MUL15(a[15], b[10])
671
		+ MUL15(a[16], b[ 9])
672
		+ MUL15(a[17], b[ 8])
673
		+ MUL15(a[18], b[ 7])
674
		+ MUL15(a[19], b[ 6]);
675
	t[26] = MUL15(a[ 7], b[19])
676
		+ MUL15(a[ 8], b[18])
677
		+ MUL15(a[ 9], b[17])
678
		+ MUL15(a[10], b[16])
679
		+ MUL15(a[11], b[15])
680
		+ MUL15(a[12], b[14])
681
		+ MUL15(a[13], b[13])
682
		+ MUL15(a[14], b[12])
683
		+ MUL15(a[15], b[11])
684
		+ MUL15(a[16], b[10])
685
		+ MUL15(a[17], b[ 9])
686
		+ MUL15(a[18], b[ 8])
687
		+ MUL15(a[19], b[ 7]);
688
	t[27] = MUL15(a[ 8], b[19])
689
		+ MUL15(a[ 9], b[18])
690
		+ MUL15(a[10], b[17])
691
		+ MUL15(a[11], b[16])
692
		+ MUL15(a[12], b[15])
693
		+ MUL15(a[13], b[14])
694
		+ MUL15(a[14], b[13])
695
		+ MUL15(a[15], b[12])
696
		+ MUL15(a[16], b[11])
697
		+ MUL15(a[17], b[10])
698
		+ MUL15(a[18], b[ 9])
699
		+ MUL15(a[19], b[ 8]);
700
	t[28] = MUL15(a[ 9], b[19])
701
		+ MUL15(a[10], b[18])
702
		+ MUL15(a[11], b[17])
703
		+ MUL15(a[12], b[16])
704
		+ MUL15(a[13], b[15])
705
		+ MUL15(a[14], b[14])
706
		+ MUL15(a[15], b[13])
707
		+ MUL15(a[16], b[12])
708
		+ MUL15(a[17], b[11])
709
		+ MUL15(a[18], b[10])
710
		+ MUL15(a[19], b[ 9]);
711
	t[29] = MUL15(a[10], b[19])
712
		+ MUL15(a[11], b[18])
713
		+ MUL15(a[12], b[17])
714
		+ MUL15(a[13], b[16])
715
		+ MUL15(a[14], b[15])
716
		+ MUL15(a[15], b[14])
717
		+ MUL15(a[16], b[13])
718
		+ MUL15(a[17], b[12])
719
		+ MUL15(a[18], b[11])
720
		+ MUL15(a[19], b[10]);
721
	t[30] = MUL15(a[11], b[19])
722
		+ MUL15(a[12], b[18])
723
		+ MUL15(a[13], b[17])
724
		+ MUL15(a[14], b[16])
725
		+ MUL15(a[15], b[15])
726
		+ MUL15(a[16], b[14])
727
		+ MUL15(a[17], b[13])
728
		+ MUL15(a[18], b[12])
729
		+ MUL15(a[19], b[11]);
730
	t[31] = MUL15(a[12], b[19])
731
		+ MUL15(a[13], b[18])
732
		+ MUL15(a[14], b[17])
733
		+ MUL15(a[15], b[16])
734
		+ MUL15(a[16], b[15])
735
		+ MUL15(a[17], b[14])
736
		+ MUL15(a[18], b[13])
737
		+ MUL15(a[19], b[12]);
738
	t[32] = MUL15(a[13], b[19])
739
		+ MUL15(a[14], b[18])
740
		+ MUL15(a[15], b[17])
741
		+ MUL15(a[16], b[16])
742
		+ MUL15(a[17], b[15])
743
		+ MUL15(a[18], b[14])
744
		+ MUL15(a[19], b[13]);
745
	t[33] = MUL15(a[14], b[19])
746
		+ MUL15(a[15], b[18])
747
		+ MUL15(a[16], b[17])
748
		+ MUL15(a[17], b[16])
749
		+ MUL15(a[18], b[15])
750
		+ MUL15(a[19], b[14]);
751
	t[34] = MUL15(a[15], b[19])
752
		+ MUL15(a[16], b[18])
753
		+ MUL15(a[17], b[17])
754
		+ MUL15(a[18], b[16])
755
		+ MUL15(a[19], b[15]);
756
	t[35] = MUL15(a[16], b[19])
757
		+ MUL15(a[17], b[18])
758
		+ MUL15(a[18], b[17])
759
		+ MUL15(a[19], b[16]);
760
	t[36] = MUL15(a[17], b[19])
761
		+ MUL15(a[18], b[18])
762
		+ MUL15(a[19], b[17]);
763
	t[37] = MUL15(a[18], b[19])
764
		+ MUL15(a[19], b[18]);
765
	t[38] = MUL15(a[19], b[19]);
766
	d[39] = norm13(d, t, 39);
767
}
768

769
static void
770
square20(uint32_t *d, const uint32_t *a)
771
{
772
	uint32_t t[39];
773

774
	t[ 0] = MUL15(a[ 0], a[ 0]);
775
	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
776
	t[ 2] = MUL15(a[ 1], a[ 1])
777
		+ ((MUL15(a[ 0], a[ 2])) << 1);
778
	t[ 3] = ((MUL15(a[ 0], a[ 3])
779
		+ MUL15(a[ 1], a[ 2])) << 1);
780
	t[ 4] = MUL15(a[ 2], a[ 2])
781
		+ ((MUL15(a[ 0], a[ 4])
782
		+ MUL15(a[ 1], a[ 3])) << 1);
783
	t[ 5] = ((MUL15(a[ 0], a[ 5])
784
		+ MUL15(a[ 1], a[ 4])
785
		+ MUL15(a[ 2], a[ 3])) << 1);
786
	t[ 6] = MUL15(a[ 3], a[ 3])
787
		+ ((MUL15(a[ 0], a[ 6])
788
		+ MUL15(a[ 1], a[ 5])
789
		+ MUL15(a[ 2], a[ 4])) << 1);
790
	t[ 7] = ((MUL15(a[ 0], a[ 7])
791
		+ MUL15(a[ 1], a[ 6])
792
		+ MUL15(a[ 2], a[ 5])
793
		+ MUL15(a[ 3], a[ 4])) << 1);
794
	t[ 8] = MUL15(a[ 4], a[ 4])
795
		+ ((MUL15(a[ 0], a[ 8])
796
		+ MUL15(a[ 1], a[ 7])
797
		+ MUL15(a[ 2], a[ 6])
798
		+ MUL15(a[ 3], a[ 5])) << 1);
799
	t[ 9] = ((MUL15(a[ 0], a[ 9])
800
		+ MUL15(a[ 1], a[ 8])
801
		+ MUL15(a[ 2], a[ 7])
802
		+ MUL15(a[ 3], a[ 6])
803
		+ MUL15(a[ 4], a[ 5])) << 1);
804
	t[10] = MUL15(a[ 5], a[ 5])
805
		+ ((MUL15(a[ 0], a[10])
806
		+ MUL15(a[ 1], a[ 9])
807
		+ MUL15(a[ 2], a[ 8])
808
		+ MUL15(a[ 3], a[ 7])
809
		+ MUL15(a[ 4], a[ 6])) << 1);
810
	t[11] = ((MUL15(a[ 0], a[11])
811
		+ MUL15(a[ 1], a[10])
812
		+ MUL15(a[ 2], a[ 9])
813
		+ MUL15(a[ 3], a[ 8])
814
		+ MUL15(a[ 4], a[ 7])
815
		+ MUL15(a[ 5], a[ 6])) << 1);
816
	t[12] = MUL15(a[ 6], a[ 6])
817
		+ ((MUL15(a[ 0], a[12])
818
		+ MUL15(a[ 1], a[11])
819
		+ MUL15(a[ 2], a[10])
820
		+ MUL15(a[ 3], a[ 9])
821
		+ MUL15(a[ 4], a[ 8])
822
		+ MUL15(a[ 5], a[ 7])) << 1);
823
	t[13] = ((MUL15(a[ 0], a[13])
824
		+ MUL15(a[ 1], a[12])
825
		+ MUL15(a[ 2], a[11])
826
		+ MUL15(a[ 3], a[10])
827
		+ MUL15(a[ 4], a[ 9])
828
		+ MUL15(a[ 5], a[ 8])
829
		+ MUL15(a[ 6], a[ 7])) << 1);
830
	t[14] = MUL15(a[ 7], a[ 7])
831
		+ ((MUL15(a[ 0], a[14])
832
		+ MUL15(a[ 1], a[13])
833
		+ MUL15(a[ 2], a[12])
834
		+ MUL15(a[ 3], a[11])
835
		+ MUL15(a[ 4], a[10])
836
		+ MUL15(a[ 5], a[ 9])
837
		+ MUL15(a[ 6], a[ 8])) << 1);
838
	t[15] = ((MUL15(a[ 0], a[15])
839
		+ MUL15(a[ 1], a[14])
840
		+ MUL15(a[ 2], a[13])
841
		+ MUL15(a[ 3], a[12])
842
		+ MUL15(a[ 4], a[11])
843
		+ MUL15(a[ 5], a[10])
844
		+ MUL15(a[ 6], a[ 9])
845
		+ MUL15(a[ 7], a[ 8])) << 1);
846
	t[16] = MUL15(a[ 8], a[ 8])
847
		+ ((MUL15(a[ 0], a[16])
848
		+ MUL15(a[ 1], a[15])
849
		+ MUL15(a[ 2], a[14])
850
		+ MUL15(a[ 3], a[13])
851
		+ MUL15(a[ 4], a[12])
852
		+ MUL15(a[ 5], a[11])
853
		+ MUL15(a[ 6], a[10])
854
		+ MUL15(a[ 7], a[ 9])) << 1);
855
	t[17] = ((MUL15(a[ 0], a[17])
856
		+ MUL15(a[ 1], a[16])
857
		+ MUL15(a[ 2], a[15])
858
		+ MUL15(a[ 3], a[14])
859
		+ MUL15(a[ 4], a[13])
860
		+ MUL15(a[ 5], a[12])
861
		+ MUL15(a[ 6], a[11])
862
		+ MUL15(a[ 7], a[10])
863
		+ MUL15(a[ 8], a[ 9])) << 1);
864
	t[18] = MUL15(a[ 9], a[ 9])
865
		+ ((MUL15(a[ 0], a[18])
866
		+ MUL15(a[ 1], a[17])
867
		+ MUL15(a[ 2], a[16])
868
		+ MUL15(a[ 3], a[15])
869
		+ MUL15(a[ 4], a[14])
870
		+ MUL15(a[ 5], a[13])
871
		+ MUL15(a[ 6], a[12])
872
		+ MUL15(a[ 7], a[11])
873
		+ MUL15(a[ 8], a[10])) << 1);
874
	t[19] = ((MUL15(a[ 0], a[19])
875
		+ MUL15(a[ 1], a[18])
876
		+ MUL15(a[ 2], a[17])
877
		+ MUL15(a[ 3], a[16])
878
		+ MUL15(a[ 4], a[15])
879
		+ MUL15(a[ 5], a[14])
880
		+ MUL15(a[ 6], a[13])
881
		+ MUL15(a[ 7], a[12])
882
		+ MUL15(a[ 8], a[11])
883
		+ MUL15(a[ 9], a[10])) << 1);
884
	t[20] = MUL15(a[10], a[10])
885
		+ ((MUL15(a[ 1], a[19])
886
		+ MUL15(a[ 2], a[18])
887
		+ MUL15(a[ 3], a[17])
888
		+ MUL15(a[ 4], a[16])
889
		+ MUL15(a[ 5], a[15])
890
		+ MUL15(a[ 6], a[14])
891
		+ MUL15(a[ 7], a[13])
892
		+ MUL15(a[ 8], a[12])
893
		+ MUL15(a[ 9], a[11])) << 1);
894
	t[21] = ((MUL15(a[ 2], a[19])
895
		+ MUL15(a[ 3], a[18])
896
		+ MUL15(a[ 4], a[17])
897
		+ MUL15(a[ 5], a[16])
898
		+ MUL15(a[ 6], a[15])
899
		+ MUL15(a[ 7], a[14])
900
		+ MUL15(a[ 8], a[13])
901
		+ MUL15(a[ 9], a[12])
902
		+ MUL15(a[10], a[11])) << 1);
903
	t[22] = MUL15(a[11], a[11])
904
		+ ((MUL15(a[ 3], a[19])
905
		+ MUL15(a[ 4], a[18])
906
		+ MUL15(a[ 5], a[17])
907
		+ MUL15(a[ 6], a[16])
908
		+ MUL15(a[ 7], a[15])
909
		+ MUL15(a[ 8], a[14])
910
		+ MUL15(a[ 9], a[13])
911
		+ MUL15(a[10], a[12])) << 1);
912
	t[23] = ((MUL15(a[ 4], a[19])
913
		+ MUL15(a[ 5], a[18])
914
		+ MUL15(a[ 6], a[17])
915
		+ MUL15(a[ 7], a[16])
916
		+ MUL15(a[ 8], a[15])
917
		+ MUL15(a[ 9], a[14])
918
		+ MUL15(a[10], a[13])
919
		+ MUL15(a[11], a[12])) << 1);
920
	t[24] = MUL15(a[12], a[12])
921
		+ ((MUL15(a[ 5], a[19])
922
		+ MUL15(a[ 6], a[18])
923
		+ MUL15(a[ 7], a[17])
924
		+ MUL15(a[ 8], a[16])
925
		+ MUL15(a[ 9], a[15])
926
		+ MUL15(a[10], a[14])
927
		+ MUL15(a[11], a[13])) << 1);
928
	t[25] = ((MUL15(a[ 6], a[19])
929
		+ MUL15(a[ 7], a[18])
930
		+ MUL15(a[ 8], a[17])
931
		+ MUL15(a[ 9], a[16])
932
		+ MUL15(a[10], a[15])
933
		+ MUL15(a[11], a[14])
934
		+ MUL15(a[12], a[13])) << 1);
935
	t[26] = MUL15(a[13], a[13])
936
		+ ((MUL15(a[ 7], a[19])
937
		+ MUL15(a[ 8], a[18])
938
		+ MUL15(a[ 9], a[17])
939
		+ MUL15(a[10], a[16])
940
		+ MUL15(a[11], a[15])
941
		+ MUL15(a[12], a[14])) << 1);
942
	t[27] = ((MUL15(a[ 8], a[19])
943
		+ MUL15(a[ 9], a[18])
944
		+ MUL15(a[10], a[17])
945
		+ MUL15(a[11], a[16])
946
		+ MUL15(a[12], a[15])
947
		+ MUL15(a[13], a[14])) << 1);
948
	t[28] = MUL15(a[14], a[14])
949
		+ ((MUL15(a[ 9], a[19])
950
		+ MUL15(a[10], a[18])
951
		+ MUL15(a[11], a[17])
952
		+ MUL15(a[12], a[16])
953
		+ MUL15(a[13], a[15])) << 1);
954
	t[29] = ((MUL15(a[10], a[19])
955
		+ MUL15(a[11], a[18])
956
		+ MUL15(a[12], a[17])
957
		+ MUL15(a[13], a[16])
958
		+ MUL15(a[14], a[15])) << 1);
959
	t[30] = MUL15(a[15], a[15])
960
		+ ((MUL15(a[11], a[19])
961
		+ MUL15(a[12], a[18])
962
		+ MUL15(a[13], a[17])
963
		+ MUL15(a[14], a[16])) << 1);
964
	t[31] = ((MUL15(a[12], a[19])
965
		+ MUL15(a[13], a[18])
966
		+ MUL15(a[14], a[17])
967
		+ MUL15(a[15], a[16])) << 1);
968
	t[32] = MUL15(a[16], a[16])
969
		+ ((MUL15(a[13], a[19])
970
		+ MUL15(a[14], a[18])
971
		+ MUL15(a[15], a[17])) << 1);
972
	t[33] = ((MUL15(a[14], a[19])
973
		+ MUL15(a[15], a[18])
974
		+ MUL15(a[16], a[17])) << 1);
975
	t[34] = MUL15(a[17], a[17])
976
		+ ((MUL15(a[15], a[19])
977
		+ MUL15(a[16], a[18])) << 1);
978
	t[35] = ((MUL15(a[16], a[19])
979
		+ MUL15(a[17], a[18])) << 1);
980
	t[36] = MUL15(a[18], a[18])
981
		+ ((MUL15(a[17], a[19])) << 1);
982
	t[37] = ((MUL15(a[18], a[19])) << 1);
983
	t[38] = MUL15(a[19], a[19]);
984
	d[39] = norm13(d, t, 39);
985
}
986

987
#endif
988

989
/*
990
 * Modulus for field F256 (field for point coordinates in curve P-256).
991
 */
992
static const uint32_t F256[] = {
993
	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
994
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
995
	0x0000, 0x1FF8, 0x1FFF, 0x01FF
996
};
997

998
/*
999
 * The 'b' curve equation coefficient for P-256.
1000
 */
1001
static const uint32_t P256_B[] = {
1002
	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003
	0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004
	0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005
};
1006

1007
/*
1008
 * Perform a "short reduction" in field F256 (field for curve P-256).
1009
 * The source value should be less than 262 bits; on output, it will
1010
 * be at most 257 bits, and less than twice the modulus.
1011
 */
1012
static void
1013
reduce_f256(uint32_t *d)
1014
{
1015
	uint32_t x;
1016

1017
	x = d[19] >> 9;
1018
	d[19] &= 0x01FF;
1019
	d[17] += x << 3;
1020
	d[14] -= x << 10;
1021
	d[7] -= x << 5;
1022
	d[0] += x;
1023
	norm13(d, d, 20);
1024
}
1025

1026
/*
1027
 * Perform a "final reduction" in field F256 (field for curve P-256).
1028
 * The source value must be less than twice the modulus. If the value
1029
 * is not lower than the modulus, then the modulus is subtracted and
1030
 * this function returns 1; otherwise, it leaves it untouched and it
1031
 * returns 0.
1032
 */
1033
static uint32_t
1034
reduce_final_f256(uint32_t *d)
1035
{
1036
	uint32_t t[20];
1037
	uint32_t cc;
1038
	int i;
1039

1040
	memcpy(t, d, sizeof t);
1041
	cc = 0;
1042
	for (i = 0; i < 20; i ++) {
1043
		uint32_t w;
1044

1045
		w = t[i] - F256[i] - cc;
1046
		cc = w >> 31;
1047
		t[i] = w & 0x1FFF;
1048
	}
1049
	cc ^= 1;
1050
	CCOPY(cc, d, t, sizeof t);
1051
	return cc;
1052
}
1053

1054
/*
1055
 * Perform a multiplication of two integers modulo
1056
 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057
 * of 20 words, each containing 13 bits of data, in little-endian order.
1058
 * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059
 * on output, value fits on 257 bits and is lower than twice the modulus.
1060
 */
1061
static void
1062
mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063
{
1064
	uint32_t t[40], cc;
1065
	int i;
1066

1067
	/*
1068
	 * Compute raw multiplication. All result words fit in 13 bits
1069
	 * each.
1070
	 */
1071
	mul20(t, a, b);
1072

1073
	/*
1074
	 * Modular reduction: each high word in added/subtracted where
1075
	 * necessary.
1076
	 *
1077
	 * The modulus is:
1078
	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079
	 * Therefore:
1080
	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081
	 *
1082
	 * For a word x at bit offset n (n >= 256), we have:
1083
	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084
	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1085
	 *
1086
	 * Thus, we can nullify the high word if we reinject it at some
1087
	 * proper emplacements.
1088
	 */
1089
	for (i = 39; i >= 20; i --) {
1090
		uint32_t x;
1091

1092
		x = t[i];
1093
		t[i - 2] += ARSH(x, 6);
1094
		t[i - 3] += (x << 7) & 0x1FFF;
1095
		t[i - 4] -= ARSH(x, 12);
1096
		t[i - 5] -= (x << 1) & 0x1FFF;
1097
		t[i - 12] -= ARSH(x, 4);
1098
		t[i - 13] -= (x << 9) & 0x1FFF;
1099
		t[i - 19] += ARSH(x, 9);
1100
		t[i - 20] += (x << 4) & 0x1FFF;
1101
	}
1102

1103
	/*
1104
	 * Propagate carries. This is a signed propagation, and the
1105
	 * result may be negative. The loop above may enlarge values,
1106
	 * but not two much: worst case is the chain involving t[i - 3],
1107
	 * in which a value may be added to itself up to 7 times. Since
1108
	 * starting values are 13-bit each, all words fit on 20 bits
1109
	 * (21 to account for the sign bit).
1110
	 */
1111
	cc = norm13(t, t, 20);
1112

1113
	/*
1114
	 * Perform modular reduction again for the bits beyond 256 (the carry
1115
	 * and the bits 256..259). Since the largest shift below is by 10
1116
	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117
	 * thereby allowing injecting full word values.
1118
	 */
1119
	cc = (cc << 4) | (t[19] >> 9);
1120
	t[19] &= 0x01FF;
1121
	t[17] += cc << 3;
1122
	t[14] -= cc << 10;
1123
	t[7] -= cc << 5;
1124
	t[0] += cc;
1125

1126
	/*
1127
	 * If the carry is negative, then after carry propagation, we may
1128
	 * end up with a value which is negative, and we don't want that.
1129
	 * Thus, in that case, we add the modulus. Note that the subtraction
1130
	 * result, when the carry is negative, is always smaller than the
1131
	 * modulus, so the extra addition will not make the value exceed
1132
	 * twice the modulus.
1133
	 */
1134
	cc >>= 31;
1135
	t[0] -= cc;
1136
	t[7] += cc << 5;
1137
	t[14] += cc << 10;
1138
	t[17] -= cc << 3;
1139
	t[19] += cc << 9;
1140

1141
	norm13(d, t, 20);
1142
}
1143

1144
/*
1145
 * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146
 * P-256). Operand is an array of 20 words, each containing 13 bits of
1147
 * data, in little-endian order. On input, upper word may be up to 13
1148
 * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1149
 * and is lower than twice the modulus.
1150
 */
1151
static void
1152
square_f256(uint32_t *d, const uint32_t *a)
1153
{
1154
	uint32_t t[40], cc;
1155
	int i;
1156

1157
	/*
1158
	 * Compute raw square. All result words fit in 13 bits each.
1159
	 */
1160
	square20(t, a);
1161

1162
	/*
1163
	 * Modular reduction: each high word in added/subtracted where
1164
	 * necessary.
1165
	 *
1166
	 * The modulus is:
1167
	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1168
	 * Therefore:
1169
	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1170
	 *
1171
	 * For a word x at bit offset n (n >= 256), we have:
1172
	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1173
	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1174
	 *
1175
	 * Thus, we can nullify the high word if we reinject it at some
1176
	 * proper emplacements.
1177
	 */
1178
	for (i = 39; i >= 20; i --) {
1179
		uint32_t x;
1180

1181
		x = t[i];
1182
		t[i - 2] += ARSH(x, 6);
1183
		t[i - 3] += (x << 7) & 0x1FFF;
1184
		t[i - 4] -= ARSH(x, 12);
1185
		t[i - 5] -= (x << 1) & 0x1FFF;
1186
		t[i - 12] -= ARSH(x, 4);
1187
		t[i - 13] -= (x << 9) & 0x1FFF;
1188
		t[i - 19] += ARSH(x, 9);
1189
		t[i - 20] += (x << 4) & 0x1FFF;
1190
	}
1191

1192
	/*
1193
	 * Propagate carries. This is a signed propagation, and the
1194
	 * result may be negative. The loop above may enlarge values,
1195
	 * but not two much: worst case is the chain involving t[i - 3],
1196
	 * in which a value may be added to itself up to 7 times. Since
1197
	 * starting values are 13-bit each, all words fit on 20 bits
1198
	 * (21 to account for the sign bit).
1199
	 */
1200
	cc = norm13(t, t, 20);
1201

1202
	/*
1203
	 * Perform modular reduction again for the bits beyond 256 (the carry
1204
	 * and the bits 256..259). Since the largest shift below is by 10
1205
	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1206
	 * thereby allowing injecting full word values.
1207
	 */
1208
	cc = (cc << 4) | (t[19] >> 9);
1209
	t[19] &= 0x01FF;
1210
	t[17] += cc << 3;
1211
	t[14] -= cc << 10;
1212
	t[7] -= cc << 5;
1213
	t[0] += cc;
1214

1215
	/*
1216
	 * If the carry is negative, then after carry propagation, we may
1217
	 * end up with a value which is negative, and we don't want that.
1218
	 * Thus, in that case, we add the modulus. Note that the subtraction
1219
	 * result, when the carry is negative, is always smaller than the
1220
	 * modulus, so the extra addition will not make the value exceed
1221
	 * twice the modulus.
1222
	 */
1223
	cc >>= 31;
1224
	t[0] -= cc;
1225
	t[7] += cc << 5;
1226
	t[14] += cc << 10;
1227
	t[17] -= cc << 3;
1228
	t[19] += cc << 9;
1229

1230
	norm13(d, t, 20);
1231
}
1232

1233
/*
1234
 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1235
 * are such that:
1236
 *   X = x / z^2
1237
 *   Y = y / z^3
1238
 * For the point at infinity, z = 0.
1239
 * Each point thus admits many possible representations.
1240
 *
1241
 * Coordinates are represented in arrays of 32-bit integers, each holding
1242
 * 13 bits of data. Values may also be slightly greater than the modulus,
1243
 * but they will always be lower than twice the modulus.
1244
 */
1245
typedef struct {
1246
	uint32_t x[20];
1247
	uint32_t y[20];
1248
	uint32_t z[20];
1249
} p256_jacobian;
1250

1251
/*
1252
 * Convert a point to affine coordinates:
1253
 *  - If the point is the point at infinity, then all three coordinates
1254
 *    are set to 0.
1255
 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1256
 *    coordinates are the 'X' and 'Y' affine coordinates.
1257
 * The coordinates are guaranteed to be lower than the modulus.
1258
 */
1259
static void
1260
p256_to_affine(p256_jacobian *P)
1261
{
1262
	uint32_t t1[20], t2[20];
1263
	int i;
1264

1265
	/*
1266
	 * Invert z with a modular exponentiation: the modulus is
1267
	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1268
	 * p-2. Exponent bit pattern (from high to low) is:
1269
	 *  - 32 bits of value 1
1270
	 *  - 31 bits of value 0
1271
	 *  - 1 bit of value 1
1272
	 *  - 96 bits of value 0
1273
	 *  - 94 bits of value 1
1274
	 *  - 1 bit of value 0
1275
	 *  - 1 bit of value 1
1276
	 * Thus, we precompute z^(2^31-1) to speed things up.
1277
	 *
1278
	 * If z = 0 (point at infinity) then the modular exponentiation
1279
	 * will yield 0, which leads to the expected result (all three
1280
	 * coordinates set to 0).
1281
	 */
1282

1283
	/*
1284
	 * A simple square-and-multiply for z^(2^31-1). We could save about
1285
	 * two dozen multiplications here with an addition chain, but
1286
	 * this would require a bit more code, and extra stack buffers.
1287
	 */
1288
	memcpy(t1, P->z, sizeof P->z);
1289
	for (i = 0; i < 30; i ++) {
1290
		square_f256(t1, t1);
1291
		mul_f256(t1, t1, P->z);
1292
	}
1293

1294
	/*
1295
	 * Square-and-multiply. Apart from the squarings, we have a few
1296
	 * multiplications to set bits to 1; we multiply by the original z
1297
	 * for setting 1 bit, and by t1 for setting 31 bits.
1298
	 */
1299
	memcpy(t2, P->z, sizeof P->z);
1300
	for (i = 1; i < 256; i ++) {
1301
		square_f256(t2, t2);
1302
		switch (i) {
1303
		case 31:
1304
		case 190:
1305
		case 221:
1306
		case 252:
1307
			mul_f256(t2, t2, t1);
1308
			break;
1309
		case 63:
1310
		case 253:
1311
		case 255:
1312
			mul_f256(t2, t2, P->z);
1313
			break;
1314
		}
1315
	}
1316

1317
	/*
1318
	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1319
	 */
1320
	mul_f256(t1, t2, t2);
1321
	mul_f256(P->x, t1, P->x);
1322
	mul_f256(t1, t1, t2);
1323
	mul_f256(P->y, t1, P->y);
1324
	reduce_final_f256(P->x);
1325
	reduce_final_f256(P->y);
1326

1327
	/*
1328
	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1329
	 * this will set z to 1.
1330
	 */
1331
	mul_f256(P->z, P->z, t2);
1332
	reduce_final_f256(P->z);
1333
}
1334

1335
/*
1336
 * Double a point in P-256. This function works for all valid points,
1337
 * including the point at infinity.
1338
 */
1339
static void
1340
p256_double(p256_jacobian *Q)
1341
{
1342
	/*
1343
	 * Doubling formulas are:
1344
	 *
1345
	 *   s = 4*x*y^2
1346
	 *   m = 3*(x + z^2)*(x - z^2)
1347
	 *   x' = m^2 - 2*s
1348
	 *   y' = m*(s - x') - 8*y^4
1349
	 *   z' = 2*y*z
1350
	 *
1351
	 * These formulas work for all points, including points of order 2
1352
	 * and points at infinity:
1353
	 *   - If y = 0 then z' = 0. But there is no such point in P-256
1354
	 *     anyway.
1355
	 *   - If z = 0 then z' = 0.
1356
	 */
1357
	uint32_t t1[20], t2[20], t3[20], t4[20];
1358
	int i;
1359

1360
	/*
1361
	 * Compute z^2 in t1.
1362
	 */
1363
	square_f256(t1, Q->z);
1364

1365
	/*
1366
	 * Compute x-z^2 in t2 and x+z^2 in t1.
1367
	 */
1368
	for (i = 0; i < 20; i ++) {
1369
		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1370
		t1[i] += Q->x[i];
1371
	}
1372
	norm13(t1, t1, 20);
1373
	norm13(t2, t2, 20);
1374

1375
	/*
1376
	 * Compute 3*(x+z^2)*(x-z^2) in t1.
1377
	 */
1378
	mul_f256(t3, t1, t2);
1379
	for (i = 0; i < 20; i ++) {
1380
		t1[i] = MUL15(3, t3[i]);
1381
	}
1382
	norm13(t1, t1, 20);
1383

1384
	/*
1385
	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1386
	 */
1387
	square_f256(t3, Q->y);
1388
	for (i = 0; i < 20; i ++) {
1389
		t3[i] <<= 1;
1390
	}
1391
	norm13(t3, t3, 20);
1392
	mul_f256(t2, Q->x, t3);
1393
	for (i = 0; i < 20; i ++) {
1394
		t2[i] <<= 1;
1395
	}
1396
	norm13(t2, t2, 20);
1397
	reduce_f256(t2);
1398

1399
	/*
1400
	 * Compute x' = m^2 - 2*s.
1401
	 */
1402
	square_f256(Q->x, t1);
1403
	for (i = 0; i < 20; i ++) {
1404
		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1405
	}
1406
	norm13(Q->x, Q->x, 20);
1407
	reduce_f256(Q->x);
1408

1409
	/*
1410
	 * Compute z' = 2*y*z.
1411
	 */
1412
	mul_f256(t4, Q->y, Q->z);
1413
	for (i = 0; i < 20; i ++) {
1414
		Q->z[i] = t4[i] << 1;
1415
	}
1416
	norm13(Q->z, Q->z, 20);
1417
	reduce_f256(Q->z);
1418

1419
	/*
1420
	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1421
	 * 2*y^2 in t3.
1422
	 */
1423
	for (i = 0; i < 20; i ++) {
1424
		t2[i] += (F256[i] << 1) - Q->x[i];
1425
	}
1426
	norm13(t2, t2, 20);
1427
	mul_f256(Q->y, t1, t2);
1428
	square_f256(t4, t3);
1429
	for (i = 0; i < 20; i ++) {
1430
		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1431
	}
1432
	norm13(Q->y, Q->y, 20);
1433
	reduce_f256(Q->y);
1434
}
1435

1436
/*
1437
 * Add point P2 to point P1.
1438
 *
1439
 * This function computes the wrong result in the following cases:
1440
 *
1441
 *   - If P1 == 0 but P2 != 0
1442
 *   - If P1 != 0 but P2 == 0
1443
 *   - If P1 == P2
1444
 *
1445
 * In all three cases, P1 is set to the point at infinity.
1446
 *
1447
 * Returned value is 0 if one of the following occurs:
1448
 *
1449
 *   - P1 and P2 have the same Y coordinate
1450
 *   - P1 == 0 and P2 == 0
1451
 *   - The Y coordinate of one of the points is 0 and the other point is
1452
 *     the point at infinity.
1453
 *
1454
 * The third case cannot actually happen with valid points, since a point
1455
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1456
 * curve P-256.
1457
 *
1458
 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1459
 * can apply the following:
1460
 *
1461
 *   - If the result is not the point at infinity, then it is correct.
1462
 *   - Otherwise, if the returned value is 1, then this is a case of
1463
 *     P1+P2 == 0, so the result is indeed the point at infinity.
1464
 *   - Otherwise, P1 == P2, so a "double" operation should have been
1465
 *     performed.
1466
 */
1467
static uint32_t
1468
p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1469
{
1470
	/*
1471
	 * Addtions formulas are:
1472
	 *
1473
	 *   u1 = x1 * z2^2
1474
	 *   u2 = x2 * z1^2
1475
	 *   s1 = y1 * z2^3
1476
	 *   s2 = y2 * z1^3
1477
	 *   h = u2 - u1
1478
	 *   r = s2 - s1
1479
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1480
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1481
	 *   z3 = h * z1 * z2
1482
	 */
1483
	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1484
	uint32_t ret;
1485
	int i;
1486

1487
	/*
1488
	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1489
	 */
1490
	square_f256(t3, P2->z);
1491
	mul_f256(t1, P1->x, t3);
1492
	mul_f256(t4, P2->z, t3);
1493
	mul_f256(t3, P1->y, t4);
1494

1495
	/*
1496
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1497
	 */
1498
	square_f256(t4, P1->z);
1499
	mul_f256(t2, P2->x, t4);
1500
	mul_f256(t5, P1->z, t4);
1501
	mul_f256(t4, P2->y, t5);
1502

1503
	/*
1504
	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1505
	 * We need to test whether r is zero, so we will do some extra
1506
	 * reduce.
1507
	 */
1508
	for (i = 0; i < 20; i ++) {
1509
		t2[i] += (F256[i] << 1) - t1[i];
1510
		t4[i] += (F256[i] << 1) - t3[i];
1511
	}
1512
	norm13(t2, t2, 20);
1513
	norm13(t4, t4, 20);
1514
	reduce_f256(t4);
1515
	reduce_final_f256(t4);
1516
	ret = 0;
1517
	for (i = 0; i < 20; i ++) {
1518
		ret |= t4[i];
1519
	}
1520
	ret = (ret | -ret) >> 31;
1521

1522
	/*
1523
	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1524
	 */
1525
	square_f256(t7, t2);
1526
	mul_f256(t6, t1, t7);
1527
	mul_f256(t5, t7, t2);
1528

1529
	/*
1530
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1531
	 */
1532
	square_f256(P1->x, t4);
1533
	for (i = 0; i < 20; i ++) {
1534
		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1535
	}
1536
	norm13(P1->x, P1->x, 20);
1537
	reduce_f256(P1->x);
1538

1539
	/*
1540
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1541
	 */
1542
	for (i = 0; i < 20; i ++) {
1543
		t6[i] += (F256[i] << 1) - P1->x[i];
1544
	}
1545
	norm13(t6, t6, 20);
1546
	mul_f256(P1->y, t4, t6);
1547
	mul_f256(t1, t5, t3);
1548
	for (i = 0; i < 20; i ++) {
1549
		P1->y[i] += (F256[i] << 1) - t1[i];
1550
	}
1551
	norm13(P1->y, P1->y, 20);
1552
	reduce_f256(P1->y);
1553

1554
	/*
1555
	 * Compute z3 = h*z1*z2.
1556
	 */
1557
	mul_f256(t1, P1->z, P2->z);
1558
	mul_f256(P1->z, t1, t2);
1559

1560
	return ret;
1561
}
1562

1563
/*
1564
 * Add point P2 to point P1. This is a specialised function for the
1565
 * case when P2 is a non-zero point in affine coordinate.
1566
 *
1567
 * This function computes the wrong result in the following cases:
1568
 *
1569
 *   - If P1 == 0
1570
 *   - If P1 == P2
1571
 *
1572
 * In both cases, P1 is set to the point at infinity.
1573
 *
1574
 * Returned value is 0 if one of the following occurs:
1575
 *
1576
 *   - P1 and P2 have the same Y coordinate
1577
 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1578
 *
1579
 * The second case cannot actually happen with valid points, since a point
1580
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1581
 * curve P-256.
1582
 *
1583
 * Therefore, assuming that P1 != 0 on input, then the caller
1584
 * can apply the following:
1585
 *
1586
 *   - If the result is not the point at infinity, then it is correct.
1587
 *   - Otherwise, if the returned value is 1, then this is a case of
1588
 *     P1+P2 == 0, so the result is indeed the point at infinity.
1589
 *   - Otherwise, P1 == P2, so a "double" operation should have been
1590
 *     performed.
1591
 */
1592
static uint32_t
1593
p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1594
{
1595
	/*
1596
	 * Addtions formulas are:
1597
	 *
1598
	 *   u1 = x1
1599
	 *   u2 = x2 * z1^2
1600
	 *   s1 = y1
1601
	 *   s2 = y2 * z1^3
1602
	 *   h = u2 - u1
1603
	 *   r = s2 - s1
1604
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1605
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1606
	 *   z3 = h * z1
1607
	 */
1608
	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1609
	uint32_t ret;
1610
	int i;
1611

1612
	/*
1613
	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1614
	 */
1615
	memcpy(t1, P1->x, sizeof t1);
1616
	memcpy(t3, P1->y, sizeof t3);
1617

1618
	/*
1619
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1620
	 */
1621
	square_f256(t4, P1->z);
1622
	mul_f256(t2, P2->x, t4);
1623
	mul_f256(t5, P1->z, t4);
1624
	mul_f256(t4, P2->y, t5);
1625

1626
	/*
1627
	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1628
	 * We need to test whether r is zero, so we will do some extra
1629
	 * reduce.
1630
	 */
1631
	for (i = 0; i < 20; i ++) {
1632
		t2[i] += (F256[i] << 1) - t1[i];
1633
		t4[i] += (F256[i] << 1) - t3[i];
1634
	}
1635
	norm13(t2, t2, 20);
1636
	norm13(t4, t4, 20);
1637
	reduce_f256(t4);
1638
	reduce_final_f256(t4);
1639
	ret = 0;
1640
	for (i = 0; i < 20; i ++) {
1641
		ret |= t4[i];
1642
	}
1643
	ret = (ret | -ret) >> 31;
1644

1645
	/*
1646
	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1647
	 */
1648
	square_f256(t7, t2);
1649
	mul_f256(t6, t1, t7);
1650
	mul_f256(t5, t7, t2);
1651

1652
	/*
1653
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1654
	 */
1655
	square_f256(P1->x, t4);
1656
	for (i = 0; i < 20; i ++) {
1657
		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1658
	}
1659
	norm13(P1->x, P1->x, 20);
1660
	reduce_f256(P1->x);
1661

1662
	/*
1663
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1664
	 */
1665
	for (i = 0; i < 20; i ++) {
1666
		t6[i] += (F256[i] << 1) - P1->x[i];
1667
	}
1668
	norm13(t6, t6, 20);
1669
	mul_f256(P1->y, t4, t6);
1670
	mul_f256(t1, t5, t3);
1671
	for (i = 0; i < 20; i ++) {
1672
		P1->y[i] += (F256[i] << 1) - t1[i];
1673
	}
1674
	norm13(P1->y, P1->y, 20);
1675
	reduce_f256(P1->y);
1676

1677
	/*
1678
	 * Compute z3 = h*z1*z2.
1679
	 */
1680
	mul_f256(P1->z, P1->z, t2);
1681

1682
	return ret;
1683
}
1684

1685
/*
1686
 * Decode a P-256 point. This function does not support the point at
1687
 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1688
 */
1689
static uint32_t
1690
p256_decode(p256_jacobian *P, const void *src, size_t len)
1691
{
1692
	const unsigned char *buf;
1693
	uint32_t tx[20], ty[20], t1[20], t2[20];
1694
	uint32_t bad;
1695
	int i;
1696

1697
	if (len != 65) {
1698
		return 0;
1699
	}
1700
	buf = src;
1701

1702
	/*
1703
	 * First byte must be 0x04 (uncompressed format). We could support
1704
	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1705
	 * least significant bit of the Y coordinate), but it is explicitly
1706
	 * forbidden by RFC 5480 (section 2.2).
1707
	 */
1708
	bad = NEQ(buf[0], 0x04);
1709

1710
	/*
1711
	 * Decode the coordinates, and check that they are both lower
1712
	 * than the modulus.
1713
	 */
1714
	tx[19] = be8_to_le13(tx, buf + 1, 32);
1715
	ty[19] = be8_to_le13(ty, buf + 33, 32);
1716
	bad |= reduce_final_f256(tx);
1717
	bad |= reduce_final_f256(ty);
1718

1719
	/*
1720
	 * Check curve equation.
1721
	 */
1722
	square_f256(t1, tx);
1723
	mul_f256(t1, tx, t1);
1724
	square_f256(t2, ty);
1725
	for (i = 0; i < 20; i ++) {
1726
		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1727
	}
1728
	norm13(t1, t1, 20);
1729
	reduce_f256(t1);
1730
	reduce_final_f256(t1);
1731
	for (i = 0; i < 20; i ++) {
1732
		bad |= t1[i];
1733
	}
1734

1735
	/*
1736
	 * Copy coordinates to the point structure.
1737
	 */
1738
	memcpy(P->x, tx, sizeof tx);
1739
	memcpy(P->y, ty, sizeof ty);
1740
	memset(P->z, 0, sizeof P->z);
1741
	P->z[0] = 1;
1742
	return EQ(bad, 0);
1743
}
1744

1745
/*
1746
 * Encode a point into a buffer. This function assumes that the point is
1747
 * valid, in affine coordinates, and not the point at infinity.
1748
 */
1749
static void
1750
p256_encode(void *dst, const p256_jacobian *P)
1751
{
1752
	unsigned char *buf;
1753

1754
	buf = dst;
1755
	buf[0] = 0x04;
1756
	le13_to_be8(buf + 1, 32, P->x);
1757
	le13_to_be8(buf + 33, 32, P->y);
1758
}
1759

1760
/*
1761
 * Multiply a curve point by an integer. The integer is assumed to be
1762
 * lower than the curve order, and the base point must not be the point
1763
 * at infinity.
1764
 */
1765
static void
1766
p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1767
{
1768
	/*
1769
	 * qz is a flag that is initially 1, and remains equal to 1
1770
	 * as long as the point is the point at infinity.
1771
	 *
1772
	 * We use a 2-bit window to handle multiplier bits by pairs.
1773
	 * The precomputed window really is the points P2 and P3.
1774
	 */
1775
	uint32_t qz;
1776
	p256_jacobian P2, P3, Q, T, U;
1777

1778
	/*
1779
	 * Compute window values.
1780
	 */
1781
	P2 = *P;
1782
	p256_double(&P2);
1783
	P3 = *P;
1784
	p256_add(&P3, &P2);
1785

1786
	/*
1787
	 * We start with Q = 0. We process multiplier bits 2 by 2.
1788
	 */
1789
	memset(&Q, 0, sizeof Q);
1790
	qz = 1;
1791
	while (xlen -- > 0) {
1792
		int k;
1793

1794
		for (k = 6; k >= 0; k -= 2) {
1795
			uint32_t bits;
1796
			uint32_t bnz;
1797

1798
			p256_double(&Q);
1799
			p256_double(&Q);
1800
			T = *P;
1801
			U = Q;
1802
			bits = (*x >> k) & (uint32_t)3;
1803
			bnz = NEQ(bits, 0);
1804
			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1805
			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1806
			p256_add(&U, &T);
1807
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1808
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1809
			qz &= ~bnz;
1810
		}
1811
		x ++;
1812
	}
1813
	*P = Q;
1814
}
1815

1816
/*
1817
 * Precomputed window: k*G points, where G is the curve generator, and k
1818
 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1819
 * the point are encoded as 20 words of 13 bits each (little-endian
1820
 * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821
 * (little-endian order within each word).
1822
 */
1823
static const uint32_t Gwin[15][20] = {
1824

1825
	{ 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1826
	  0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1827
	  0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1828
	  0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1829
	  0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1830

1831
	{ 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1832
	  0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1833
	  0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1834
	  0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1835
	  0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1836

1837
	{ 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1838
	  0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1839
	  0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1840
	  0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1841
	  0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1842

1843
	{ 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1844
	  0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1845
	  0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1846
	  0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1847
	  0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1848

1849
	{ 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1850
	  0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1851
	  0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1852
	  0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1853
	  0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1854

1855
	{ 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1856
	  0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1857
	  0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1858
	  0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1859
	  0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1860

1861
	{ 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1862
	  0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1863
	  0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1864
	  0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1865
	  0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1866

1867
	{ 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1868
	  0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1869
	  0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1870
	  0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1871
	  0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1872

1873
	{ 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1874
	  0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1875
	  0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1876
	  0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1877
	  0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1878

1879
	{ 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1880
	  0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1881
	  0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1882
	  0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1883
	  0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1884

1885
	{ 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1886
	  0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1887
	  0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1888
	  0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1889
	  0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1890

1891
	{ 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1892
	  0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1893
	  0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1894
	  0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1895
	  0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1896

1897
	{ 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1898
	  0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1899
	  0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1900
	  0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1901
	  0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1902

1903
	{ 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1904
	  0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1905
	  0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1906
	  0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1907
	  0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1908

1909
	{ 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1910
	  0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1911
	  0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1912
	  0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1913
	  0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1914
};
1915

1916
/*
1917
 * Lookup one of the Gwin[] values, by index. This is constant-time.
1918
 */
1919
static void
1920
lookup_Gwin(p256_jacobian *T, uint32_t idx)
1921
{
1922
	uint32_t xy[20];
1923
	uint32_t k;
1924
	size_t u;
1925

1926
	memset(xy, 0, sizeof xy);
1927
	for (k = 0; k < 15; k ++) {
1928
		uint32_t m;
1929

1930
		m = -EQ(idx, k + 1);
1931
		for (u = 0; u < 20; u ++) {
1932
			xy[u] |= m & Gwin[k][u];
1933
		}
1934
	}
1935
	for (u = 0; u < 10; u ++) {
1936
		T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1937
		T->x[(u << 1) + 1] = xy[u] >> 16;
1938
		T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1939
		T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1940
	}
1941
	memset(T->z, 0, sizeof T->z);
1942
	T->z[0] = 1;
1943
}
1944

1945
/*
1946
 * Multiply the generator by an integer. The integer is assumed non-zero
1947
 * and lower than the curve order.
1948
 */
1949
static void
1950
p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1951
{
1952
	/*
1953
	 * qz is a flag that is initially 1, and remains equal to 1
1954
	 * as long as the point is the point at infinity.
1955
	 *
1956
	 * We use a 4-bit window to handle multiplier bits by groups
1957
	 * of 4. The precomputed window is constant static data, with
1958
	 * points in affine coordinates; we use a constant-time lookup.
1959
	 */
1960
	p256_jacobian Q;
1961
	uint32_t qz;
1962

1963
	memset(&Q, 0, sizeof Q);
1964
	qz = 1;
1965
	while (xlen -- > 0) {
1966
		int k;
1967
		unsigned bx;
1968

1969
		bx = *x ++;
1970
		for (k = 0; k < 2; k ++) {
1971
			uint32_t bits;
1972
			uint32_t bnz;
1973
			p256_jacobian T, U;
1974

1975
			p256_double(&Q);
1976
			p256_double(&Q);
1977
			p256_double(&Q);
1978
			p256_double(&Q);
1979
			bits = (bx >> 4) & 0x0F;
1980
			bnz = NEQ(bits, 0);
1981
			lookup_Gwin(&T, bits);
1982
			U = Q;
1983
			p256_add_mixed(&U, &T);
1984
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1985
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1986
			qz &= ~bnz;
1987
			bx <<= 4;
1988
		}
1989
	}
1990
	*P = Q;
1991
}
1992

1993
static const unsigned char P256_G[] = {
1994
	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1995
	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1996
	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1997
	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1998
	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1999
	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
2000
	0x68, 0x37, 0xBF, 0x51, 0xF5
2001
};
2002

2003
static const unsigned char P256_N[] = {
2004
	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
2005
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
2006
	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
2007
	0x25, 0x51
2008
};
2009

2010
static const unsigned char *
2011
api_generator(int curve, size_t *len)
2012
{
2013
	(void)curve;
2014
	*len = sizeof P256_G;
2015
	return P256_G;
2016
}
2017

2018
static const unsigned char *
2019
api_order(int curve, size_t *len)
2020
{
2021
	(void)curve;
2022
	*len = sizeof P256_N;
2023
	return P256_N;
2024
}
2025

2026
static size_t
2027
api_xoff(int curve, size_t *len)
2028
{
2029
	(void)curve;
2030
	*len = 32;
2031
	return 1;
2032
}
2033

2034
static uint32_t
2035
api_mul(unsigned char *G, size_t Glen,
2036
	const unsigned char *x, size_t xlen, int curve)
2037
{
2038
	uint32_t r;
2039
	p256_jacobian P;
2040

2041
	(void)curve;
2042
	if (Glen != 65) {
2043
		return 0;
2044
	}
2045
	r = p256_decode(&P, G, Glen);
2046
	p256_mul(&P, x, xlen);
2047
	p256_to_affine(&P);
2048
	p256_encode(G, &P);
2049
	return r;
2050
}
2051

2052
static size_t
2053
api_mulgen(unsigned char *R,
2054
	const unsigned char *x, size_t xlen, int curve)
2055
{
2056
	p256_jacobian P;
2057

2058
	(void)curve;
2059
	p256_mulgen(&P, x, xlen);
2060
	p256_to_affine(&P);
2061
	p256_encode(R, &P);
2062
	return 65;
2063
}
2064

2065
static uint32_t
2066
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2067
	const unsigned char *x, size_t xlen,
2068
	const unsigned char *y, size_t ylen, int curve)
2069
{
2070
	p256_jacobian P, Q;
2071
	uint32_t r, t, z;
2072
	int i;
2073

2074
	(void)curve;
2075
	if (len != 65) {
2076
		return 0;
2077
	}
2078
	r = p256_decode(&P, A, len);
2079
	p256_mul(&P, x, xlen);
2080
	if (B == NULL) {
2081
		p256_mulgen(&Q, y, ylen);
2082
	} else {
2083
		r &= p256_decode(&Q, B, len);
2084
		p256_mul(&Q, y, ylen);
2085
	}
2086

2087
	/*
2088
	 * The final addition may fail in case both points are equal.
2089
	 */
2090
	t = p256_add(&P, &Q);
2091
	reduce_final_f256(P.z);
2092
	z = 0;
2093
	for (i = 0; i < 20; i ++) {
2094
		z |= P.z[i];
2095
	}
2096
	z = EQ(z, 0);
2097
	p256_double(&Q);
2098

2099
	/*
2100
	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2101
	 * have the following:
2102
	 *
2103
	 *   z = 0, t = 0   return P (normal addition)
2104
	 *   z = 0, t = 1   return P (normal addition)
2105
	 *   z = 1, t = 0   return Q (a 'double' case)
2106
	 *   z = 1, t = 1   report an error (P+Q = 0)
2107
	 */
2108
	CCOPY(z & ~t, &P, &Q, sizeof Q);
2109
	p256_to_affine(&P);
2110
	p256_encode(A, &P);
2111
	r &= ~(z & t);
2112
	return r;
2113
}
2114

2115
/* see bearssl_ec.h */
2116
const br_ec_impl br_ec_p256_m15 = {
2117
	(uint32_t)0x00800000,
2118
	&api_generator,
2119
	&api_order,
2120
	&api_xoff,
2121
	&api_mul,
2122
	&api_mulgen,
2123
	&api_muladd
2124
};
2125

2126
Product

Resources

Company