Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/bearssl/src/ec/ec_c25519_m15.c
39536 views
1
/*
2
* Copyright (c) 2017 Thomas Pornin <[email protected]>
3
*
4
* Permission is hereby granted, free of charge, to any person obtaining
5
* a copy of this software and associated documentation files (the
6
* "Software"), to deal in the Software without restriction, including
7
* without limitation the rights to use, copy, modify, merge, publish,
8
* distribute, sublicense, and/or sell copies of the Software, and to
9
* permit persons to whom the Software is furnished to do so, subject to
10
* the following conditions:
11
*
12
* The above copyright notice and this permission notice shall be
13
* included in all copies or substantial portions of the Software.
14
*
15
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
25
#include "inner.h"
26
27
/* obsolete
28
#include <stdio.h>
29
#include <stdlib.h>
30
static void
31
print_int(const char *name, const uint32_t *x)
32
{
33
size_t u;
34
unsigned char tmp[36];
35
36
printf("%s = ", name);
37
for (u = 0; u < 20; u ++) {
38
if (x[u] > 0x1FFF) {
39
printf("INVALID:");
40
for (u = 0; u < 20; u ++) {
41
printf(" %04X", x[u]);
42
}
43
printf("\n");
44
return;
45
}
46
}
47
memset(tmp, 0, sizeof tmp);
48
for (u = 0; u < 20; u ++) {
49
uint32_t w;
50
int j, k;
51
52
w = x[u];
53
j = 13 * (int)u;
54
k = j & 7;
55
if (k != 0) {
56
w <<= k;
57
j -= k;
58
}
59
k = j >> 3;
60
tmp[35 - k] |= (unsigned char)w;
61
tmp[34 - k] |= (unsigned char)(w >> 8);
62
tmp[33 - k] |= (unsigned char)(w >> 16);
63
tmp[32 - k] |= (unsigned char)(w >> 24);
64
}
65
for (u = 4; u < 36; u ++) {
66
printf("%02X", tmp[u]);
67
}
68
printf("\n");
69
}
70
*/
71
72
/*
73
* If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74
* that right-shifting a signed negative integer copies the sign bit
75
* (arithmetic right-shift). This is "implementation-defined behaviour",
76
* i.e. it is not undefined, but it may differ between compilers. Each
77
* compiler is supposed to document its behaviour in that respect. GCC
78
* explicitly defines that an arithmetic right shift is used. We expect
79
* all other compilers to do the same, because underlying CPU offer an
80
* arithmetic right shift opcode that could not be used otherwise.
81
*/
82
#if BR_NO_ARITH_SHIFT
83
#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
84
| ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85
#else
86
#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
87
#endif
88
89
/*
90
* Convert an integer from unsigned little-endian encoding to a sequence of
91
* 13-bit words in little-endian order. The final "partial" word is
92
* returned.
93
*/
94
static uint32_t
95
le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
96
{
97
uint32_t acc;
98
int acc_len;
99
100
acc = 0;
101
acc_len = 0;
102
while (len -- > 0) {
103
acc |= (uint32_t)(*src ++) << acc_len;
104
acc_len += 8;
105
if (acc_len >= 13) {
106
*dst ++ = acc & 0x1FFF;
107
acc >>= 13;
108
acc_len -= 13;
109
}
110
}
111
return acc;
112
}
113
114
/*
115
* Convert an integer (13-bit words, little-endian) to unsigned
116
* little-endian encoding. The total encoding length is provided; all
117
* the destination bytes will be filled.
118
*/
119
static void
120
le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
121
{
122
uint32_t acc;
123
int acc_len;
124
125
acc = 0;
126
acc_len = 0;
127
while (len -- > 0) {
128
if (acc_len < 8) {
129
acc |= (*src ++) << acc_len;
130
acc_len += 13;
131
}
132
*dst ++ = (unsigned char)acc;
133
acc >>= 8;
134
acc_len -= 8;
135
}
136
}
137
138
/*
139
* Normalise an array of words to a strict 13 bits per word. Returned
140
* value is the resulting carry. The source (w) and destination (d)
141
* arrays may be identical, but shall not overlap partially.
142
*/
143
static inline uint32_t
144
norm13(uint32_t *d, const uint32_t *w, size_t len)
145
{
146
size_t u;
147
uint32_t cc;
148
149
cc = 0;
150
for (u = 0; u < len; u ++) {
151
int32_t z;
152
153
z = w[u] + cc;
154
d[u] = z & 0x1FFF;
155
cc = ARSH(z, 13);
156
}
157
return cc;
158
}
159
160
/*
161
* mul20() multiplies two 260-bit integers together. Each word must fit
162
* on 13 bits; source operands use 20 words, destination operand
163
* receives 40 words. All overlaps allowed.
164
*
165
* square20() computes the square of a 260-bit integer. Each word must
166
* fit on 13 bits; source operand uses 20 words, destination operand
167
* receives 40 words. All overlaps allowed.
168
*/
169
170
#if BR_SLOW_MUL15
171
172
static void
173
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
174
{
175
/*
176
* Two-level Karatsuba: turns a 20x20 multiplication into
177
* nine 5x5 multiplications. We use 13-bit words but do not
178
* propagate carries immediately, so words may expand:
179
*
180
* - First Karatsuba decomposition turns the 20x20 mul on
181
* 13-bit words into three 10x10 muls, two on 13-bit words
182
* and one on 14-bit words.
183
*
184
* - Second Karatsuba decomposition further splits these into:
185
*
186
* * four 5x5 muls on 13-bit words
187
* * four 5x5 muls on 14-bit words
188
* * one 5x5 mul on 15-bit words
189
*
190
* Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
191
* or 15-bit words, respectively.
192
*/
193
uint32_t u[45], v[45], w[90];
194
uint32_t cc;
195
int i;
196
197
#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
198
(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
199
+ (s2w)[5 * (s2_off) + 0]; \
200
(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
201
+ (s2w)[5 * (s2_off) + 1]; \
202
(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
203
+ (s2w)[5 * (s2_off) + 2]; \
204
(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
205
+ (s2w)[5 * (s2_off) + 3]; \
206
(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
207
+ (s2w)[5 * (s2_off) + 4]; \
208
} while (0)
209
210
#define ZADDT(dw, d_off, sw, s_off) do { \
211
(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
212
(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
213
(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
214
(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
215
(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
216
} while (0)
217
218
#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
219
(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
220
+ (s2w)[5 * (s2_off) + 0]; \
221
(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
222
+ (s2w)[5 * (s2_off) + 1]; \
223
(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
224
+ (s2w)[5 * (s2_off) + 2]; \
225
(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
226
+ (s2w)[5 * (s2_off) + 3]; \
227
(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
228
+ (s2w)[5 * (s2_off) + 4]; \
229
} while (0)
230
231
#define CPR1(w, cprcc) do { \
232
uint32_t cprz = (w) + cprcc; \
233
(w) = cprz & 0x1FFF; \
234
cprcc = cprz >> 13; \
235
} while (0)
236
237
#define CPR(dw, d_off) do { \
238
uint32_t cprcc; \
239
cprcc = 0; \
240
CPR1((dw)[(d_off) + 0], cprcc); \
241
CPR1((dw)[(d_off) + 1], cprcc); \
242
CPR1((dw)[(d_off) + 2], cprcc); \
243
CPR1((dw)[(d_off) + 3], cprcc); \
244
CPR1((dw)[(d_off) + 4], cprcc); \
245
CPR1((dw)[(d_off) + 5], cprcc); \
246
CPR1((dw)[(d_off) + 6], cprcc); \
247
CPR1((dw)[(d_off) + 7], cprcc); \
248
CPR1((dw)[(d_off) + 8], cprcc); \
249
(dw)[(d_off) + 9] = cprcc; \
250
} while (0)
251
252
memcpy(u, a, 20 * sizeof *a);
253
ZADD(u, 4, a, 0, a, 1);
254
ZADD(u, 5, a, 2, a, 3);
255
ZADD(u, 6, a, 0, a, 2);
256
ZADD(u, 7, a, 1, a, 3);
257
ZADD(u, 8, u, 6, u, 7);
258
259
memcpy(v, b, 20 * sizeof *b);
260
ZADD(v, 4, b, 0, b, 1);
261
ZADD(v, 5, b, 2, b, 3);
262
ZADD(v, 6, b, 0, b, 2);
263
ZADD(v, 7, b, 1, b, 3);
264
ZADD(v, 8, v, 6, v, 7);
265
266
/*
267
* Do the eight first 8x8 muls. Source words are at most 16382
268
* each, so we can add product results together "as is" in 32-bit
269
* words.
270
*/
271
for (i = 0; i < 40; i += 5) {
272
w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
273
w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
274
+ MUL15(u[i + 1], v[i + 0]);
275
w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
276
+ MUL15(u[i + 1], v[i + 1])
277
+ MUL15(u[i + 2], v[i + 0]);
278
w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
279
+ MUL15(u[i + 1], v[i + 2])
280
+ MUL15(u[i + 2], v[i + 1])
281
+ MUL15(u[i + 3], v[i + 0]);
282
w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
283
+ MUL15(u[i + 1], v[i + 3])
284
+ MUL15(u[i + 2], v[i + 2])
285
+ MUL15(u[i + 3], v[i + 1])
286
+ MUL15(u[i + 4], v[i + 0]);
287
w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
288
+ MUL15(u[i + 2], v[i + 3])
289
+ MUL15(u[i + 3], v[i + 2])
290
+ MUL15(u[i + 4], v[i + 1]);
291
w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
292
+ MUL15(u[i + 3], v[i + 3])
293
+ MUL15(u[i + 4], v[i + 2]);
294
w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
295
+ MUL15(u[i + 4], v[i + 3]);
296
w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
297
w[(i << 1) + 9] = 0;
298
}
299
300
/*
301
* For the 9th multiplication, source words are up to 32764,
302
* so we must do some carry propagation. If we add up to
303
* 4 products and the carry is no more than 524224, then the
304
* result fits in 32 bits, and the next carry will be no more
305
* than 524224 (because 4*(32764^2)+524224 < 8192*524225).
306
*
307
* We thus just skip one of the products in the middle word,
308
* then do a carry propagation (this reduces words to 13 bits
309
* each, except possibly the last, which may use up to 17 bits
310
* or so), then add the missing product.
311
*/
312
w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
313
w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
314
+ MUL15(u[40 + 1], v[40 + 0]);
315
w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
316
+ MUL15(u[40 + 1], v[40 + 1])
317
+ MUL15(u[40 + 2], v[40 + 0]);
318
w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
319
+ MUL15(u[40 + 1], v[40 + 2])
320
+ MUL15(u[40 + 2], v[40 + 1])
321
+ MUL15(u[40 + 3], v[40 + 0]);
322
w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
323
+ MUL15(u[40 + 1], v[40 + 3])
324
+ MUL15(u[40 + 2], v[40 + 2])
325
+ MUL15(u[40 + 3], v[40 + 1]);
326
/* + MUL15(u[40 + 4], v[40 + 0]) */
327
w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
328
+ MUL15(u[40 + 2], v[40 + 3])
329
+ MUL15(u[40 + 3], v[40 + 2])
330
+ MUL15(u[40 + 4], v[40 + 1]);
331
w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
332
+ MUL15(u[40 + 3], v[40 + 3])
333
+ MUL15(u[40 + 4], v[40 + 2]);
334
w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
335
+ MUL15(u[40 + 4], v[40 + 3]);
336
w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
337
338
CPR(w, 80);
339
340
w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
341
342
/*
343
* The products on 14-bit words in slots 6 and 7 yield values
344
* up to 5*(16382^2) each, and we need to subtract two such
345
* values from the higher word. We need the subtraction to fit
346
* in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
347
* However, 10*(16382^2) does not fit. So we must perform a
348
* bit of reduction here.
349
*/
350
CPR(w, 60);
351
CPR(w, 70);
352
353
/*
354
* Recompose results.
355
*/
356
357
/* 0..1*0..1 into 0..3 */
358
ZSUB2F(w, 8, w, 0, w, 2);
359
ZSUB2F(w, 9, w, 1, w, 3);
360
ZADDT(w, 1, w, 8);
361
ZADDT(w, 2, w, 9);
362
363
/* 2..3*2..3 into 4..7 */
364
ZSUB2F(w, 10, w, 4, w, 6);
365
ZSUB2F(w, 11, w, 5, w, 7);
366
ZADDT(w, 5, w, 10);
367
ZADDT(w, 6, w, 11);
368
369
/* (0..1+2..3)*(0..1+2..3) into 12..15 */
370
ZSUB2F(w, 16, w, 12, w, 14);
371
ZSUB2F(w, 17, w, 13, w, 15);
372
ZADDT(w, 13, w, 16);
373
ZADDT(w, 14, w, 17);
374
375
/* first-level recomposition */
376
ZSUB2F(w, 12, w, 0, w, 4);
377
ZSUB2F(w, 13, w, 1, w, 5);
378
ZSUB2F(w, 14, w, 2, w, 6);
379
ZSUB2F(w, 15, w, 3, w, 7);
380
ZADDT(w, 2, w, 12);
381
ZADDT(w, 3, w, 13);
382
ZADDT(w, 4, w, 14);
383
ZADDT(w, 5, w, 15);
384
385
/*
386
* Perform carry propagation to bring all words down to 13 bits.
387
*/
388
cc = norm13(d, w, 40);
389
d[39] += (cc << 13);
390
391
#undef ZADD
392
#undef ZADDT
393
#undef ZSUB2F
394
#undef CPR1
395
#undef CPR
396
}
397
398
static inline void
399
square20(uint32_t *d, const uint32_t *a)
400
{
401
mul20(d, a, a);
402
}
403
404
#else
405
406
static void
407
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
408
{
409
uint32_t t[39];
410
411
t[ 0] = MUL15(a[ 0], b[ 0]);
412
t[ 1] = MUL15(a[ 0], b[ 1])
413
+ MUL15(a[ 1], b[ 0]);
414
t[ 2] = MUL15(a[ 0], b[ 2])
415
+ MUL15(a[ 1], b[ 1])
416
+ MUL15(a[ 2], b[ 0]);
417
t[ 3] = MUL15(a[ 0], b[ 3])
418
+ MUL15(a[ 1], b[ 2])
419
+ MUL15(a[ 2], b[ 1])
420
+ MUL15(a[ 3], b[ 0]);
421
t[ 4] = MUL15(a[ 0], b[ 4])
422
+ MUL15(a[ 1], b[ 3])
423
+ MUL15(a[ 2], b[ 2])
424
+ MUL15(a[ 3], b[ 1])
425
+ MUL15(a[ 4], b[ 0]);
426
t[ 5] = MUL15(a[ 0], b[ 5])
427
+ MUL15(a[ 1], b[ 4])
428
+ MUL15(a[ 2], b[ 3])
429
+ MUL15(a[ 3], b[ 2])
430
+ MUL15(a[ 4], b[ 1])
431
+ MUL15(a[ 5], b[ 0]);
432
t[ 6] = MUL15(a[ 0], b[ 6])
433
+ MUL15(a[ 1], b[ 5])
434
+ MUL15(a[ 2], b[ 4])
435
+ MUL15(a[ 3], b[ 3])
436
+ MUL15(a[ 4], b[ 2])
437
+ MUL15(a[ 5], b[ 1])
438
+ MUL15(a[ 6], b[ 0]);
439
t[ 7] = MUL15(a[ 0], b[ 7])
440
+ MUL15(a[ 1], b[ 6])
441
+ MUL15(a[ 2], b[ 5])
442
+ MUL15(a[ 3], b[ 4])
443
+ MUL15(a[ 4], b[ 3])
444
+ MUL15(a[ 5], b[ 2])
445
+ MUL15(a[ 6], b[ 1])
446
+ MUL15(a[ 7], b[ 0]);
447
t[ 8] = MUL15(a[ 0], b[ 8])
448
+ MUL15(a[ 1], b[ 7])
449
+ MUL15(a[ 2], b[ 6])
450
+ MUL15(a[ 3], b[ 5])
451
+ MUL15(a[ 4], b[ 4])
452
+ MUL15(a[ 5], b[ 3])
453
+ MUL15(a[ 6], b[ 2])
454
+ MUL15(a[ 7], b[ 1])
455
+ MUL15(a[ 8], b[ 0]);
456
t[ 9] = MUL15(a[ 0], b[ 9])
457
+ MUL15(a[ 1], b[ 8])
458
+ MUL15(a[ 2], b[ 7])
459
+ MUL15(a[ 3], b[ 6])
460
+ MUL15(a[ 4], b[ 5])
461
+ MUL15(a[ 5], b[ 4])
462
+ MUL15(a[ 6], b[ 3])
463
+ MUL15(a[ 7], b[ 2])
464
+ MUL15(a[ 8], b[ 1])
465
+ MUL15(a[ 9], b[ 0]);
466
t[10] = MUL15(a[ 0], b[10])
467
+ MUL15(a[ 1], b[ 9])
468
+ MUL15(a[ 2], b[ 8])
469
+ MUL15(a[ 3], b[ 7])
470
+ MUL15(a[ 4], b[ 6])
471
+ MUL15(a[ 5], b[ 5])
472
+ MUL15(a[ 6], b[ 4])
473
+ MUL15(a[ 7], b[ 3])
474
+ MUL15(a[ 8], b[ 2])
475
+ MUL15(a[ 9], b[ 1])
476
+ MUL15(a[10], b[ 0]);
477
t[11] = MUL15(a[ 0], b[11])
478
+ MUL15(a[ 1], b[10])
479
+ MUL15(a[ 2], b[ 9])
480
+ MUL15(a[ 3], b[ 8])
481
+ MUL15(a[ 4], b[ 7])
482
+ MUL15(a[ 5], b[ 6])
483
+ MUL15(a[ 6], b[ 5])
484
+ MUL15(a[ 7], b[ 4])
485
+ MUL15(a[ 8], b[ 3])
486
+ MUL15(a[ 9], b[ 2])
487
+ MUL15(a[10], b[ 1])
488
+ MUL15(a[11], b[ 0]);
489
t[12] = MUL15(a[ 0], b[12])
490
+ MUL15(a[ 1], b[11])
491
+ MUL15(a[ 2], b[10])
492
+ MUL15(a[ 3], b[ 9])
493
+ MUL15(a[ 4], b[ 8])
494
+ MUL15(a[ 5], b[ 7])
495
+ MUL15(a[ 6], b[ 6])
496
+ MUL15(a[ 7], b[ 5])
497
+ MUL15(a[ 8], b[ 4])
498
+ MUL15(a[ 9], b[ 3])
499
+ MUL15(a[10], b[ 2])
500
+ MUL15(a[11], b[ 1])
501
+ MUL15(a[12], b[ 0]);
502
t[13] = MUL15(a[ 0], b[13])
503
+ MUL15(a[ 1], b[12])
504
+ MUL15(a[ 2], b[11])
505
+ MUL15(a[ 3], b[10])
506
+ MUL15(a[ 4], b[ 9])
507
+ MUL15(a[ 5], b[ 8])
508
+ MUL15(a[ 6], b[ 7])
509
+ MUL15(a[ 7], b[ 6])
510
+ MUL15(a[ 8], b[ 5])
511
+ MUL15(a[ 9], b[ 4])
512
+ MUL15(a[10], b[ 3])
513
+ MUL15(a[11], b[ 2])
514
+ MUL15(a[12], b[ 1])
515
+ MUL15(a[13], b[ 0]);
516
t[14] = MUL15(a[ 0], b[14])
517
+ MUL15(a[ 1], b[13])
518
+ MUL15(a[ 2], b[12])
519
+ MUL15(a[ 3], b[11])
520
+ MUL15(a[ 4], b[10])
521
+ MUL15(a[ 5], b[ 9])
522
+ MUL15(a[ 6], b[ 8])
523
+ MUL15(a[ 7], b[ 7])
524
+ MUL15(a[ 8], b[ 6])
525
+ MUL15(a[ 9], b[ 5])
526
+ MUL15(a[10], b[ 4])
527
+ MUL15(a[11], b[ 3])
528
+ MUL15(a[12], b[ 2])
529
+ MUL15(a[13], b[ 1])
530
+ MUL15(a[14], b[ 0]);
531
t[15] = MUL15(a[ 0], b[15])
532
+ MUL15(a[ 1], b[14])
533
+ MUL15(a[ 2], b[13])
534
+ MUL15(a[ 3], b[12])
535
+ MUL15(a[ 4], b[11])
536
+ MUL15(a[ 5], b[10])
537
+ MUL15(a[ 6], b[ 9])
538
+ MUL15(a[ 7], b[ 8])
539
+ MUL15(a[ 8], b[ 7])
540
+ MUL15(a[ 9], b[ 6])
541
+ MUL15(a[10], b[ 5])
542
+ MUL15(a[11], b[ 4])
543
+ MUL15(a[12], b[ 3])
544
+ MUL15(a[13], b[ 2])
545
+ MUL15(a[14], b[ 1])
546
+ MUL15(a[15], b[ 0]);
547
t[16] = MUL15(a[ 0], b[16])
548
+ MUL15(a[ 1], b[15])
549
+ MUL15(a[ 2], b[14])
550
+ MUL15(a[ 3], b[13])
551
+ MUL15(a[ 4], b[12])
552
+ MUL15(a[ 5], b[11])
553
+ MUL15(a[ 6], b[10])
554
+ MUL15(a[ 7], b[ 9])
555
+ MUL15(a[ 8], b[ 8])
556
+ MUL15(a[ 9], b[ 7])
557
+ MUL15(a[10], b[ 6])
558
+ MUL15(a[11], b[ 5])
559
+ MUL15(a[12], b[ 4])
560
+ MUL15(a[13], b[ 3])
561
+ MUL15(a[14], b[ 2])
562
+ MUL15(a[15], b[ 1])
563
+ MUL15(a[16], b[ 0]);
564
t[17] = MUL15(a[ 0], b[17])
565
+ MUL15(a[ 1], b[16])
566
+ MUL15(a[ 2], b[15])
567
+ MUL15(a[ 3], b[14])
568
+ MUL15(a[ 4], b[13])
569
+ MUL15(a[ 5], b[12])
570
+ MUL15(a[ 6], b[11])
571
+ MUL15(a[ 7], b[10])
572
+ MUL15(a[ 8], b[ 9])
573
+ MUL15(a[ 9], b[ 8])
574
+ MUL15(a[10], b[ 7])
575
+ MUL15(a[11], b[ 6])
576
+ MUL15(a[12], b[ 5])
577
+ MUL15(a[13], b[ 4])
578
+ MUL15(a[14], b[ 3])
579
+ MUL15(a[15], b[ 2])
580
+ MUL15(a[16], b[ 1])
581
+ MUL15(a[17], b[ 0]);
582
t[18] = MUL15(a[ 0], b[18])
583
+ MUL15(a[ 1], b[17])
584
+ MUL15(a[ 2], b[16])
585
+ MUL15(a[ 3], b[15])
586
+ MUL15(a[ 4], b[14])
587
+ MUL15(a[ 5], b[13])
588
+ MUL15(a[ 6], b[12])
589
+ MUL15(a[ 7], b[11])
590
+ MUL15(a[ 8], b[10])
591
+ MUL15(a[ 9], b[ 9])
592
+ MUL15(a[10], b[ 8])
593
+ MUL15(a[11], b[ 7])
594
+ MUL15(a[12], b[ 6])
595
+ MUL15(a[13], b[ 5])
596
+ MUL15(a[14], b[ 4])
597
+ MUL15(a[15], b[ 3])
598
+ MUL15(a[16], b[ 2])
599
+ MUL15(a[17], b[ 1])
600
+ MUL15(a[18], b[ 0]);
601
t[19] = MUL15(a[ 0], b[19])
602
+ MUL15(a[ 1], b[18])
603
+ MUL15(a[ 2], b[17])
604
+ MUL15(a[ 3], b[16])
605
+ MUL15(a[ 4], b[15])
606
+ MUL15(a[ 5], b[14])
607
+ MUL15(a[ 6], b[13])
608
+ MUL15(a[ 7], b[12])
609
+ MUL15(a[ 8], b[11])
610
+ MUL15(a[ 9], b[10])
611
+ MUL15(a[10], b[ 9])
612
+ MUL15(a[11], b[ 8])
613
+ MUL15(a[12], b[ 7])
614
+ MUL15(a[13], b[ 6])
615
+ MUL15(a[14], b[ 5])
616
+ MUL15(a[15], b[ 4])
617
+ MUL15(a[16], b[ 3])
618
+ MUL15(a[17], b[ 2])
619
+ MUL15(a[18], b[ 1])
620
+ MUL15(a[19], b[ 0]);
621
t[20] = MUL15(a[ 1], b[19])
622
+ MUL15(a[ 2], b[18])
623
+ MUL15(a[ 3], b[17])
624
+ MUL15(a[ 4], b[16])
625
+ MUL15(a[ 5], b[15])
626
+ MUL15(a[ 6], b[14])
627
+ MUL15(a[ 7], b[13])
628
+ MUL15(a[ 8], b[12])
629
+ MUL15(a[ 9], b[11])
630
+ MUL15(a[10], b[10])
631
+ MUL15(a[11], b[ 9])
632
+ MUL15(a[12], b[ 8])
633
+ MUL15(a[13], b[ 7])
634
+ MUL15(a[14], b[ 6])
635
+ MUL15(a[15], b[ 5])
636
+ MUL15(a[16], b[ 4])
637
+ MUL15(a[17], b[ 3])
638
+ MUL15(a[18], b[ 2])
639
+ MUL15(a[19], b[ 1]);
640
t[21] = MUL15(a[ 2], b[19])
641
+ MUL15(a[ 3], b[18])
642
+ MUL15(a[ 4], b[17])
643
+ MUL15(a[ 5], b[16])
644
+ MUL15(a[ 6], b[15])
645
+ MUL15(a[ 7], b[14])
646
+ MUL15(a[ 8], b[13])
647
+ MUL15(a[ 9], b[12])
648
+ MUL15(a[10], b[11])
649
+ MUL15(a[11], b[10])
650
+ MUL15(a[12], b[ 9])
651
+ MUL15(a[13], b[ 8])
652
+ MUL15(a[14], b[ 7])
653
+ MUL15(a[15], b[ 6])
654
+ MUL15(a[16], b[ 5])
655
+ MUL15(a[17], b[ 4])
656
+ MUL15(a[18], b[ 3])
657
+ MUL15(a[19], b[ 2]);
658
t[22] = MUL15(a[ 3], b[19])
659
+ MUL15(a[ 4], b[18])
660
+ MUL15(a[ 5], b[17])
661
+ MUL15(a[ 6], b[16])
662
+ MUL15(a[ 7], b[15])
663
+ MUL15(a[ 8], b[14])
664
+ MUL15(a[ 9], b[13])
665
+ MUL15(a[10], b[12])
666
+ MUL15(a[11], b[11])
667
+ MUL15(a[12], b[10])
668
+ MUL15(a[13], b[ 9])
669
+ MUL15(a[14], b[ 8])
670
+ MUL15(a[15], b[ 7])
671
+ MUL15(a[16], b[ 6])
672
+ MUL15(a[17], b[ 5])
673
+ MUL15(a[18], b[ 4])
674
+ MUL15(a[19], b[ 3]);
675
t[23] = MUL15(a[ 4], b[19])
676
+ MUL15(a[ 5], b[18])
677
+ MUL15(a[ 6], b[17])
678
+ MUL15(a[ 7], b[16])
679
+ MUL15(a[ 8], b[15])
680
+ MUL15(a[ 9], b[14])
681
+ MUL15(a[10], b[13])
682
+ MUL15(a[11], b[12])
683
+ MUL15(a[12], b[11])
684
+ MUL15(a[13], b[10])
685
+ MUL15(a[14], b[ 9])
686
+ MUL15(a[15], b[ 8])
687
+ MUL15(a[16], b[ 7])
688
+ MUL15(a[17], b[ 6])
689
+ MUL15(a[18], b[ 5])
690
+ MUL15(a[19], b[ 4]);
691
t[24] = MUL15(a[ 5], b[19])
692
+ MUL15(a[ 6], b[18])
693
+ MUL15(a[ 7], b[17])
694
+ MUL15(a[ 8], b[16])
695
+ MUL15(a[ 9], b[15])
696
+ MUL15(a[10], b[14])
697
+ MUL15(a[11], b[13])
698
+ MUL15(a[12], b[12])
699
+ MUL15(a[13], b[11])
700
+ MUL15(a[14], b[10])
701
+ MUL15(a[15], b[ 9])
702
+ MUL15(a[16], b[ 8])
703
+ MUL15(a[17], b[ 7])
704
+ MUL15(a[18], b[ 6])
705
+ MUL15(a[19], b[ 5]);
706
t[25] = MUL15(a[ 6], b[19])
707
+ MUL15(a[ 7], b[18])
708
+ MUL15(a[ 8], b[17])
709
+ MUL15(a[ 9], b[16])
710
+ MUL15(a[10], b[15])
711
+ MUL15(a[11], b[14])
712
+ MUL15(a[12], b[13])
713
+ MUL15(a[13], b[12])
714
+ MUL15(a[14], b[11])
715
+ MUL15(a[15], b[10])
716
+ MUL15(a[16], b[ 9])
717
+ MUL15(a[17], b[ 8])
718
+ MUL15(a[18], b[ 7])
719
+ MUL15(a[19], b[ 6]);
720
t[26] = MUL15(a[ 7], b[19])
721
+ MUL15(a[ 8], b[18])
722
+ MUL15(a[ 9], b[17])
723
+ MUL15(a[10], b[16])
724
+ MUL15(a[11], b[15])
725
+ MUL15(a[12], b[14])
726
+ MUL15(a[13], b[13])
727
+ MUL15(a[14], b[12])
728
+ MUL15(a[15], b[11])
729
+ MUL15(a[16], b[10])
730
+ MUL15(a[17], b[ 9])
731
+ MUL15(a[18], b[ 8])
732
+ MUL15(a[19], b[ 7]);
733
t[27] = MUL15(a[ 8], b[19])
734
+ MUL15(a[ 9], b[18])
735
+ MUL15(a[10], b[17])
736
+ MUL15(a[11], b[16])
737
+ MUL15(a[12], b[15])
738
+ MUL15(a[13], b[14])
739
+ MUL15(a[14], b[13])
740
+ MUL15(a[15], b[12])
741
+ MUL15(a[16], b[11])
742
+ MUL15(a[17], b[10])
743
+ MUL15(a[18], b[ 9])
744
+ MUL15(a[19], b[ 8]);
745
t[28] = MUL15(a[ 9], b[19])
746
+ MUL15(a[10], b[18])
747
+ MUL15(a[11], b[17])
748
+ MUL15(a[12], b[16])
749
+ MUL15(a[13], b[15])
750
+ MUL15(a[14], b[14])
751
+ MUL15(a[15], b[13])
752
+ MUL15(a[16], b[12])
753
+ MUL15(a[17], b[11])
754
+ MUL15(a[18], b[10])
755
+ MUL15(a[19], b[ 9]);
756
t[29] = MUL15(a[10], b[19])
757
+ MUL15(a[11], b[18])
758
+ MUL15(a[12], b[17])
759
+ MUL15(a[13], b[16])
760
+ MUL15(a[14], b[15])
761
+ MUL15(a[15], b[14])
762
+ MUL15(a[16], b[13])
763
+ MUL15(a[17], b[12])
764
+ MUL15(a[18], b[11])
765
+ MUL15(a[19], b[10]);
766
t[30] = MUL15(a[11], b[19])
767
+ MUL15(a[12], b[18])
768
+ MUL15(a[13], b[17])
769
+ MUL15(a[14], b[16])
770
+ MUL15(a[15], b[15])
771
+ MUL15(a[16], b[14])
772
+ MUL15(a[17], b[13])
773
+ MUL15(a[18], b[12])
774
+ MUL15(a[19], b[11]);
775
t[31] = MUL15(a[12], b[19])
776
+ MUL15(a[13], b[18])
777
+ MUL15(a[14], b[17])
778
+ MUL15(a[15], b[16])
779
+ MUL15(a[16], b[15])
780
+ MUL15(a[17], b[14])
781
+ MUL15(a[18], b[13])
782
+ MUL15(a[19], b[12]);
783
t[32] = MUL15(a[13], b[19])
784
+ MUL15(a[14], b[18])
785
+ MUL15(a[15], b[17])
786
+ MUL15(a[16], b[16])
787
+ MUL15(a[17], b[15])
788
+ MUL15(a[18], b[14])
789
+ MUL15(a[19], b[13]);
790
t[33] = MUL15(a[14], b[19])
791
+ MUL15(a[15], b[18])
792
+ MUL15(a[16], b[17])
793
+ MUL15(a[17], b[16])
794
+ MUL15(a[18], b[15])
795
+ MUL15(a[19], b[14]);
796
t[34] = MUL15(a[15], b[19])
797
+ MUL15(a[16], b[18])
798
+ MUL15(a[17], b[17])
799
+ MUL15(a[18], b[16])
800
+ MUL15(a[19], b[15]);
801
t[35] = MUL15(a[16], b[19])
802
+ MUL15(a[17], b[18])
803
+ MUL15(a[18], b[17])
804
+ MUL15(a[19], b[16]);
805
t[36] = MUL15(a[17], b[19])
806
+ MUL15(a[18], b[18])
807
+ MUL15(a[19], b[17]);
808
t[37] = MUL15(a[18], b[19])
809
+ MUL15(a[19], b[18]);
810
t[38] = MUL15(a[19], b[19]);
811
812
d[39] = norm13(d, t, 39);
813
}
814
815
static void
816
square20(uint32_t *d, const uint32_t *a)
817
{
818
uint32_t t[39];
819
820
t[ 0] = MUL15(a[ 0], a[ 0]);
821
t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
822
t[ 2] = MUL15(a[ 1], a[ 1])
823
+ ((MUL15(a[ 0], a[ 2])) << 1);
824
t[ 3] = ((MUL15(a[ 0], a[ 3])
825
+ MUL15(a[ 1], a[ 2])) << 1);
826
t[ 4] = MUL15(a[ 2], a[ 2])
827
+ ((MUL15(a[ 0], a[ 4])
828
+ MUL15(a[ 1], a[ 3])) << 1);
829
t[ 5] = ((MUL15(a[ 0], a[ 5])
830
+ MUL15(a[ 1], a[ 4])
831
+ MUL15(a[ 2], a[ 3])) << 1);
832
t[ 6] = MUL15(a[ 3], a[ 3])
833
+ ((MUL15(a[ 0], a[ 6])
834
+ MUL15(a[ 1], a[ 5])
835
+ MUL15(a[ 2], a[ 4])) << 1);
836
t[ 7] = ((MUL15(a[ 0], a[ 7])
837
+ MUL15(a[ 1], a[ 6])
838
+ MUL15(a[ 2], a[ 5])
839
+ MUL15(a[ 3], a[ 4])) << 1);
840
t[ 8] = MUL15(a[ 4], a[ 4])
841
+ ((MUL15(a[ 0], a[ 8])
842
+ MUL15(a[ 1], a[ 7])
843
+ MUL15(a[ 2], a[ 6])
844
+ MUL15(a[ 3], a[ 5])) << 1);
845
t[ 9] = ((MUL15(a[ 0], a[ 9])
846
+ MUL15(a[ 1], a[ 8])
847
+ MUL15(a[ 2], a[ 7])
848
+ MUL15(a[ 3], a[ 6])
849
+ MUL15(a[ 4], a[ 5])) << 1);
850
t[10] = MUL15(a[ 5], a[ 5])
851
+ ((MUL15(a[ 0], a[10])
852
+ MUL15(a[ 1], a[ 9])
853
+ MUL15(a[ 2], a[ 8])
854
+ MUL15(a[ 3], a[ 7])
855
+ MUL15(a[ 4], a[ 6])) << 1);
856
t[11] = ((MUL15(a[ 0], a[11])
857
+ MUL15(a[ 1], a[10])
858
+ MUL15(a[ 2], a[ 9])
859
+ MUL15(a[ 3], a[ 8])
860
+ MUL15(a[ 4], a[ 7])
861
+ MUL15(a[ 5], a[ 6])) << 1);
862
t[12] = MUL15(a[ 6], a[ 6])
863
+ ((MUL15(a[ 0], a[12])
864
+ MUL15(a[ 1], a[11])
865
+ MUL15(a[ 2], a[10])
866
+ MUL15(a[ 3], a[ 9])
867
+ MUL15(a[ 4], a[ 8])
868
+ MUL15(a[ 5], a[ 7])) << 1);
869
t[13] = ((MUL15(a[ 0], a[13])
870
+ MUL15(a[ 1], a[12])
871
+ MUL15(a[ 2], a[11])
872
+ MUL15(a[ 3], a[10])
873
+ MUL15(a[ 4], a[ 9])
874
+ MUL15(a[ 5], a[ 8])
875
+ MUL15(a[ 6], a[ 7])) << 1);
876
t[14] = MUL15(a[ 7], a[ 7])
877
+ ((MUL15(a[ 0], a[14])
878
+ MUL15(a[ 1], a[13])
879
+ MUL15(a[ 2], a[12])
880
+ MUL15(a[ 3], a[11])
881
+ MUL15(a[ 4], a[10])
882
+ MUL15(a[ 5], a[ 9])
883
+ MUL15(a[ 6], a[ 8])) << 1);
884
t[15] = ((MUL15(a[ 0], a[15])
885
+ MUL15(a[ 1], a[14])
886
+ MUL15(a[ 2], a[13])
887
+ MUL15(a[ 3], a[12])
888
+ MUL15(a[ 4], a[11])
889
+ MUL15(a[ 5], a[10])
890
+ MUL15(a[ 6], a[ 9])
891
+ MUL15(a[ 7], a[ 8])) << 1);
892
t[16] = MUL15(a[ 8], a[ 8])
893
+ ((MUL15(a[ 0], a[16])
894
+ MUL15(a[ 1], a[15])
895
+ MUL15(a[ 2], a[14])
896
+ MUL15(a[ 3], a[13])
897
+ MUL15(a[ 4], a[12])
898
+ MUL15(a[ 5], a[11])
899
+ MUL15(a[ 6], a[10])
900
+ MUL15(a[ 7], a[ 9])) << 1);
901
t[17] = ((MUL15(a[ 0], a[17])
902
+ MUL15(a[ 1], a[16])
903
+ MUL15(a[ 2], a[15])
904
+ MUL15(a[ 3], a[14])
905
+ MUL15(a[ 4], a[13])
906
+ MUL15(a[ 5], a[12])
907
+ MUL15(a[ 6], a[11])
908
+ MUL15(a[ 7], a[10])
909
+ MUL15(a[ 8], a[ 9])) << 1);
910
t[18] = MUL15(a[ 9], a[ 9])
911
+ ((MUL15(a[ 0], a[18])
912
+ MUL15(a[ 1], a[17])
913
+ MUL15(a[ 2], a[16])
914
+ MUL15(a[ 3], a[15])
915
+ MUL15(a[ 4], a[14])
916
+ MUL15(a[ 5], a[13])
917
+ MUL15(a[ 6], a[12])
918
+ MUL15(a[ 7], a[11])
919
+ MUL15(a[ 8], a[10])) << 1);
920
t[19] = ((MUL15(a[ 0], a[19])
921
+ MUL15(a[ 1], a[18])
922
+ MUL15(a[ 2], a[17])
923
+ MUL15(a[ 3], a[16])
924
+ MUL15(a[ 4], a[15])
925
+ MUL15(a[ 5], a[14])
926
+ MUL15(a[ 6], a[13])
927
+ MUL15(a[ 7], a[12])
928
+ MUL15(a[ 8], a[11])
929
+ MUL15(a[ 9], a[10])) << 1);
930
t[20] = MUL15(a[10], a[10])
931
+ ((MUL15(a[ 1], a[19])
932
+ MUL15(a[ 2], a[18])
933
+ MUL15(a[ 3], a[17])
934
+ MUL15(a[ 4], a[16])
935
+ MUL15(a[ 5], a[15])
936
+ MUL15(a[ 6], a[14])
937
+ MUL15(a[ 7], a[13])
938
+ MUL15(a[ 8], a[12])
939
+ MUL15(a[ 9], a[11])) << 1);
940
t[21] = ((MUL15(a[ 2], a[19])
941
+ MUL15(a[ 3], a[18])
942
+ MUL15(a[ 4], a[17])
943
+ MUL15(a[ 5], a[16])
944
+ MUL15(a[ 6], a[15])
945
+ MUL15(a[ 7], a[14])
946
+ MUL15(a[ 8], a[13])
947
+ MUL15(a[ 9], a[12])
948
+ MUL15(a[10], a[11])) << 1);
949
t[22] = MUL15(a[11], a[11])
950
+ ((MUL15(a[ 3], a[19])
951
+ MUL15(a[ 4], a[18])
952
+ MUL15(a[ 5], a[17])
953
+ MUL15(a[ 6], a[16])
954
+ MUL15(a[ 7], a[15])
955
+ MUL15(a[ 8], a[14])
956
+ MUL15(a[ 9], a[13])
957
+ MUL15(a[10], a[12])) << 1);
958
t[23] = ((MUL15(a[ 4], a[19])
959
+ MUL15(a[ 5], a[18])
960
+ MUL15(a[ 6], a[17])
961
+ MUL15(a[ 7], a[16])
962
+ MUL15(a[ 8], a[15])
963
+ MUL15(a[ 9], a[14])
964
+ MUL15(a[10], a[13])
965
+ MUL15(a[11], a[12])) << 1);
966
t[24] = MUL15(a[12], a[12])
967
+ ((MUL15(a[ 5], a[19])
968
+ MUL15(a[ 6], a[18])
969
+ MUL15(a[ 7], a[17])
970
+ MUL15(a[ 8], a[16])
971
+ MUL15(a[ 9], a[15])
972
+ MUL15(a[10], a[14])
973
+ MUL15(a[11], a[13])) << 1);
974
t[25] = ((MUL15(a[ 6], a[19])
975
+ MUL15(a[ 7], a[18])
976
+ MUL15(a[ 8], a[17])
977
+ MUL15(a[ 9], a[16])
978
+ MUL15(a[10], a[15])
979
+ MUL15(a[11], a[14])
980
+ MUL15(a[12], a[13])) << 1);
981
t[26] = MUL15(a[13], a[13])
982
+ ((MUL15(a[ 7], a[19])
983
+ MUL15(a[ 8], a[18])
984
+ MUL15(a[ 9], a[17])
985
+ MUL15(a[10], a[16])
986
+ MUL15(a[11], a[15])
987
+ MUL15(a[12], a[14])) << 1);
988
t[27] = ((MUL15(a[ 8], a[19])
989
+ MUL15(a[ 9], a[18])
990
+ MUL15(a[10], a[17])
991
+ MUL15(a[11], a[16])
992
+ MUL15(a[12], a[15])
993
+ MUL15(a[13], a[14])) << 1);
994
t[28] = MUL15(a[14], a[14])
995
+ ((MUL15(a[ 9], a[19])
996
+ MUL15(a[10], a[18])
997
+ MUL15(a[11], a[17])
998
+ MUL15(a[12], a[16])
999
+ MUL15(a[13], a[15])) << 1);
1000
t[29] = ((MUL15(a[10], a[19])
1001
+ MUL15(a[11], a[18])
1002
+ MUL15(a[12], a[17])
1003
+ MUL15(a[13], a[16])
1004
+ MUL15(a[14], a[15])) << 1);
1005
t[30] = MUL15(a[15], a[15])
1006
+ ((MUL15(a[11], a[19])
1007
+ MUL15(a[12], a[18])
1008
+ MUL15(a[13], a[17])
1009
+ MUL15(a[14], a[16])) << 1);
1010
t[31] = ((MUL15(a[12], a[19])
1011
+ MUL15(a[13], a[18])
1012
+ MUL15(a[14], a[17])
1013
+ MUL15(a[15], a[16])) << 1);
1014
t[32] = MUL15(a[16], a[16])
1015
+ ((MUL15(a[13], a[19])
1016
+ MUL15(a[14], a[18])
1017
+ MUL15(a[15], a[17])) << 1);
1018
t[33] = ((MUL15(a[14], a[19])
1019
+ MUL15(a[15], a[18])
1020
+ MUL15(a[16], a[17])) << 1);
1021
t[34] = MUL15(a[17], a[17])
1022
+ ((MUL15(a[15], a[19])
1023
+ MUL15(a[16], a[18])) << 1);
1024
t[35] = ((MUL15(a[16], a[19])
1025
+ MUL15(a[17], a[18])) << 1);
1026
t[36] = MUL15(a[18], a[18])
1027
+ ((MUL15(a[17], a[19])) << 1);
1028
t[37] = ((MUL15(a[18], a[19])) << 1);
1029
t[38] = MUL15(a[19], a[19]);
1030
1031
d[39] = norm13(d, t, 39);
1032
}
1033
1034
#endif
1035
1036
/*
1037
* Perform a "final reduction" in field F255 (field for Curve25519)
1038
* The source value must be less than twice the modulus. If the value
1039
* is not lower than the modulus, then the modulus is subtracted and
1040
* this function returns 1; otherwise, it leaves it untouched and it
1041
* returns 0.
1042
*/
1043
static uint32_t
1044
reduce_final_f255(uint32_t *d)
1045
{
1046
uint32_t t[20];
1047
uint32_t cc;
1048
int i;
1049
1050
memcpy(t, d, sizeof t);
1051
cc = 19;
1052
for (i = 0; i < 20; i ++) {
1053
uint32_t w;
1054
1055
w = t[i] + cc;
1056
cc = w >> 13;
1057
t[i] = w & 0x1FFF;
1058
}
1059
cc = t[19] >> 8;
1060
t[19] &= 0xFF;
1061
CCOPY(cc, d, t, sizeof t);
1062
return cc;
1063
}
1064
1065
static void
1066
f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square)
1067
{
1068
uint32_t t[40], cc, w;
1069
1070
/*
1071
* Compute raw multiplication. All result words fit in 13 bits
1072
* each; upper word (t[39]) must fit on 5 bits, since the product
1073
* of two 256-bit integers must fit on 512 bits.
1074
*/
1075
if (square) {
1076
square20(t, a);
1077
} else {
1078
mul20(t, a, b);
1079
}
1080
1081
/*
1082
* Modular reduction: each high word is added where necessary.
1083
* Since the modulus is 2^255-19 and word 20 corresponds to
1084
* offset 20*13 = 260, word 20+k must be added to word k with
1085
* a factor of 19*2^5 = 608. The extra bits in word 19 are also
1086
* added that way.
1087
*/
1088
cc = MUL15(t[19] >> 8, 19);
1089
t[19] &= 0xFF;
1090
1091
#define MM1(x) do { \
1092
w = t[x] + cc + MUL15(t[(x) + 20], 608); \
1093
t[x] = w & 0x1FFF; \
1094
cc = w >> 13; \
1095
} while (0)
1096
1097
MM1( 0);
1098
MM1( 1);
1099
MM1( 2);
1100
MM1( 3);
1101
MM1( 4);
1102
MM1( 5);
1103
MM1( 6);
1104
MM1( 7);
1105
MM1( 8);
1106
MM1( 9);
1107
MM1(10);
1108
MM1(11);
1109
MM1(12);
1110
MM1(13);
1111
MM1(14);
1112
MM1(15);
1113
MM1(16);
1114
MM1(17);
1115
MM1(18);
1116
MM1(19);
1117
1118
#undef MM1
1119
1120
cc = MUL15(w >> 8, 19);
1121
t[19] &= 0xFF;
1122
1123
#define MM2(x) do { \
1124
w = t[x] + cc; \
1125
d[x] = w & 0x1FFF; \
1126
cc = w >> 13; \
1127
} while (0)
1128
1129
MM2( 0);
1130
MM2( 1);
1131
MM2( 2);
1132
MM2( 3);
1133
MM2( 4);
1134
MM2( 5);
1135
MM2( 6);
1136
MM2( 7);
1137
MM2( 8);
1138
MM2( 9);
1139
MM2(10);
1140
MM2(11);
1141
MM2(12);
1142
MM2(13);
1143
MM2(14);
1144
MM2(15);
1145
MM2(16);
1146
MM2(17);
1147
MM2(18);
1148
MM2(19);
1149
1150
#undef MM2
1151
}
1152
1153
/*
1154
* Perform a multiplication of two integers modulo 2^255-19.
1155
* Operands are arrays of 20 words, each containing 13 bits of data, in
1156
* little-endian order. Input value may be up to 2^256-1; on output, value
1157
* fits on 256 bits and is lower than twice the modulus.
1158
*
1159
* f255_mul() is the general multiplication, f255_square() is specialised
1160
* for squarings.
1161
*/
1162
#define f255_mul(d, a, b) f255_mulgen(d, a, b, 0)
1163
#define f255_square(d, a) f255_mulgen(d, a, a, 1)
1164
1165
/*
1166
* Add two values in F255. Partial reduction is performed (down to less
1167
* than twice the modulus).
1168
*/
1169
static void
1170
f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
1171
{
1172
int i;
1173
uint32_t cc, w;
1174
1175
cc = 0;
1176
for (i = 0; i < 20; i ++) {
1177
w = a[i] + b[i] + cc;
1178
d[i] = w & 0x1FFF;
1179
cc = w >> 13;
1180
}
1181
cc = MUL15(w >> 8, 19);
1182
d[19] &= 0xFF;
1183
for (i = 0; i < 20; i ++) {
1184
w = d[i] + cc;
1185
d[i] = w & 0x1FFF;
1186
cc = w >> 13;
1187
}
1188
}
1189
1190
/*
1191
* Subtract one value from another in F255. Partial reduction is
1192
* performed (down to less than twice the modulus).
1193
*/
1194
static void
1195
f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
1196
{
1197
/*
1198
* We actually compute a - b + 2*p, so that the final value is
1199
* necessarily positive.
1200
*/
1201
int i;
1202
uint32_t cc, w;
1203
1204
cc = (uint32_t)-38;
1205
for (i = 0; i < 20; i ++) {
1206
w = a[i] - b[i] + cc;
1207
d[i] = w & 0x1FFF;
1208
cc = ARSH(w, 13);
1209
}
1210
cc = MUL15((w + 0x200) >> 8, 19);
1211
d[19] &= 0xFF;
1212
for (i = 0; i < 20; i ++) {
1213
w = d[i] + cc;
1214
d[i] = w & 0x1FFF;
1215
cc = w >> 13;
1216
}
1217
}
1218
1219
/*
1220
* Multiply an integer by the 'A24' constant (121665). Partial reduction
1221
* is performed (down to less than twice the modulus).
1222
*/
1223
static void
1224
f255_mul_a24(uint32_t *d, const uint32_t *a)
1225
{
1226
int i;
1227
uint32_t cc, w;
1228
1229
cc = 0;
1230
for (i = 0; i < 20; i ++) {
1231
w = MUL15(a[i], 121665) + cc;
1232
d[i] = w & 0x1FFF;
1233
cc = w >> 13;
1234
}
1235
cc = MUL15(w >> 8, 19);
1236
d[19] &= 0xFF;
1237
for (i = 0; i < 20; i ++) {
1238
w = d[i] + cc;
1239
d[i] = w & 0x1FFF;
1240
cc = w >> 13;
1241
}
1242
}
1243
1244
static const unsigned char GEN[] = {
1245
0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1246
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1247
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1248
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1249
};
1250
1251
static const unsigned char ORDER[] = {
1252
0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1253
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1254
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1255
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
1256
};
1257
1258
static const unsigned char *
1259
api_generator(int curve, size_t *len)
1260
{
1261
(void)curve;
1262
*len = 32;
1263
return GEN;
1264
}
1265
1266
static const unsigned char *
1267
api_order(int curve, size_t *len)
1268
{
1269
(void)curve;
1270
*len = 32;
1271
return ORDER;
1272
}
1273
1274
static size_t
1275
api_xoff(int curve, size_t *len)
1276
{
1277
(void)curve;
1278
*len = 32;
1279
return 0;
1280
}
1281
1282
static void
1283
cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
1284
{
1285
int i;
1286
1287
ctl = -ctl;
1288
for (i = 0; i < 20; i ++) {
1289
uint32_t aw, bw, tw;
1290
1291
aw = a[i];
1292
bw = b[i];
1293
tw = ctl & (aw ^ bw);
1294
a[i] = aw ^ tw;
1295
b[i] = bw ^ tw;
1296
}
1297
}
1298
1299
static uint32_t
1300
api_mul(unsigned char *G, size_t Glen,
1301
const unsigned char *kb, size_t kblen, int curve)
1302
{
1303
uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
1304
uint32_t a[20], aa[20], b[20], bb[20];
1305
uint32_t c[20], d[20], e[20], da[20], cb[20];
1306
unsigned char k[32];
1307
uint32_t swap;
1308
int i;
1309
1310
(void)curve;
1311
1312
/*
1313
* Points are encoded over exactly 32 bytes. Multipliers must fit
1314
* in 32 bytes as well.
1315
* RFC 7748 mandates that the high bit of the last point byte must
1316
* be ignored/cleared.
1317
*/
1318
if (Glen != 32 || kblen > 32) {
1319
return 0;
1320
}
1321
G[31] &= 0x7F;
1322
1323
/*
1324
* Initialise variables x1, x2, z2, x3 and z3. We set all of them
1325
* into Montgomery representation.
1326
*/
1327
x1[19] = le8_to_le13(x1, G, 32);
1328
memcpy(x3, x1, sizeof x1);
1329
memset(z2, 0, sizeof z2);
1330
memset(x2, 0, sizeof x2);
1331
x2[0] = 1;
1332
memset(z3, 0, sizeof z3);
1333
z3[0] = 1;
1334
1335
memset(k, 0, (sizeof k) - kblen);
1336
memcpy(k + (sizeof k) - kblen, kb, kblen);
1337
k[31] &= 0xF8;
1338
k[0] &= 0x7F;
1339
k[0] |= 0x40;
1340
1341
/* obsolete
1342
print_int("x1", x1);
1343
*/
1344
1345
swap = 0;
1346
for (i = 254; i >= 0; i --) {
1347
uint32_t kt;
1348
1349
kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
1350
swap ^= kt;
1351
cswap(x2, x3, swap);
1352
cswap(z2, z3, swap);
1353
swap = kt;
1354
1355
/* obsolete
1356
print_int("x2", x2);
1357
print_int("z2", z2);
1358
print_int("x3", x3);
1359
print_int("z3", z3);
1360
*/
1361
1362
f255_add(a, x2, z2);
1363
f255_square(aa, a);
1364
f255_sub(b, x2, z2);
1365
f255_square(bb, b);
1366
f255_sub(e, aa, bb);
1367
f255_add(c, x3, z3);
1368
f255_sub(d, x3, z3);
1369
f255_mul(da, d, a);
1370
f255_mul(cb, c, b);
1371
1372
/* obsolete
1373
print_int("a ", a);
1374
print_int("aa", aa);
1375
print_int("b ", b);
1376
print_int("bb", bb);
1377
print_int("e ", e);
1378
print_int("c ", c);
1379
print_int("d ", d);
1380
print_int("da", da);
1381
print_int("cb", cb);
1382
*/
1383
1384
f255_add(x3, da, cb);
1385
f255_square(x3, x3);
1386
f255_sub(z3, da, cb);
1387
f255_square(z3, z3);
1388
f255_mul(z3, z3, x1);
1389
f255_mul(x2, aa, bb);
1390
f255_mul_a24(z2, e);
1391
f255_add(z2, z2, aa);
1392
f255_mul(z2, e, z2);
1393
1394
/* obsolete
1395
print_int("x2", x2);
1396
print_int("z2", z2);
1397
print_int("x3", x3);
1398
print_int("z3", z3);
1399
*/
1400
}
1401
cswap(x2, x3, swap);
1402
cswap(z2, z3, swap);
1403
1404
/*
1405
* Inverse z2 with a modular exponentiation. This is a simple
1406
* square-and-multiply algorithm; we mutualise most non-squarings
1407
* since the exponent contains almost only ones.
1408
*/
1409
memcpy(a, z2, sizeof z2);
1410
for (i = 0; i < 15; i ++) {
1411
f255_square(a, a);
1412
f255_mul(a, a, z2);
1413
}
1414
memcpy(b, a, sizeof a);
1415
for (i = 0; i < 14; i ++) {
1416
int j;
1417
1418
for (j = 0; j < 16; j ++) {
1419
f255_square(b, b);
1420
}
1421
f255_mul(b, b, a);
1422
}
1423
for (i = 14; i >= 0; i --) {
1424
f255_square(b, b);
1425
if ((0xFFEB >> i) & 1) {
1426
f255_mul(b, z2, b);
1427
}
1428
}
1429
f255_mul(x2, x2, b);
1430
reduce_final_f255(x2);
1431
le13_to_le8(G, 32, x2);
1432
return 1;
1433
}
1434
1435
static size_t
1436
api_mulgen(unsigned char *R,
1437
const unsigned char *x, size_t xlen, int curve)
1438
{
1439
const unsigned char *G;
1440
size_t Glen;
1441
1442
G = api_generator(curve, &Glen);
1443
memcpy(R, G, Glen);
1444
api_mul(R, Glen, x, xlen, curve);
1445
return Glen;
1446
}
1447
1448
static uint32_t
1449
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1450
const unsigned char *x, size_t xlen,
1451
const unsigned char *y, size_t ylen, int curve)
1452
{
1453
/*
1454
* We don't implement this method, since it is used for ECDSA
1455
* only, and there is no ECDSA over Curve25519 (which instead
1456
* uses EdDSA).
1457
*/
1458
(void)A;
1459
(void)B;
1460
(void)len;
1461
(void)x;
1462
(void)xlen;
1463
(void)y;
1464
(void)ylen;
1465
(void)curve;
1466
return 0;
1467
}
1468
1469
/* see bearssl_ec.h */
1470
const br_ec_impl br_ec_c25519_m15 = {
1471
(uint32_t)0x20000000,
1472
&api_generator,
1473
&api_order,
1474
&api_xoff,
1475
&api_mul,
1476
&api_mulgen,
1477
&api_muladd
1478
};
1479
1480