GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/curve25519-hacl64.c
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */

#include <linux/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>
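/*
 * Field elements are stored in radix 2^51: five u64 limbs, each holding
 * 51 bits of a value modulo p = 2^255 - 19. The mask 0x7ffffffffffff is
 * 2^51 - 1. The helpers below compare u64 values without branches and
 * return an all-ones or all-zero mask, which keeps selections
 * constant-time with respect to secret data.
 */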
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	u64 c = xnx - (u64)1U;
	return c;
}

static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	u64 c = x_xor_q_ - (u64)1U;
	return c;
}
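/*
 * Fold the carry out of the top limb back into the bottom limb, using
 * 2^255 == 19 (mod p): each unit carried above bit 51 of b[4] is worth
 * 19 when added to b[0].
 */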
static __always_inline void modulo_carry_top(u64 *b)
{
	u64 b4 = b[4];
	u64 b0 = b[0];
	u64 b4_ = b4 & 0x7ffffffffffffLLU;
	u64 b0_ = b0 + 19 * (b4 >> 51);
	b[4] = b4_;
	b[0] = b0_;
}
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
	{
		u128 xi = input[0];
		output[0] = ((u64)(xi));
	}
	{
		u128 xi = input[1];
		output[1] = ((u64)(xi));
	}
	{
		u128 xi = input[2];
		output[2] = ((u64)(xi));
	}
	{
		u128 xi = input[3];
		output[3] = ((u64)(xi));
	}
	{
		u128 xi = input[4];
		output[4] = ((u64)(xi));
	}
}

static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
	output[0] += (u128)input[0] * s;
	output[1] += (u128)input[1] * s;
	output[2] += (u128)input[2] * s;
	output[3] += (u128)input[3] * s;
	output[4] += (u128)input[4] * s;
}
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
	{
		u32 ctr = 0;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 1;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}

	{
		u32 ctr = 2;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 3;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
}
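/*
 * fmul_shift_reduce() multiplies a field element by 2^51 in place: the
 * limbs are rotated up one position and the limb that wraps around is
 * multiplied by 19 (2^255 == 19 mod p). fmul_mul_shift_reduce_() uses it
 * to accumulate input * input21 one 51-bit digit of input21 at a time.
 */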
static __always_inline void fmul_shift_reduce(u64 *output)
{
	u64 tmp = output[4];
	u64 b0;
	{
		u32 ctr = 5 - 0 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 1 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 2 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 3 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	output[0] = tmp;
	b0 = output[0];
	output[0] = 19 * b0;
}

static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
						   u64 *input21)
{
	u32 i;
	u64 input2i;
	{
		u64 input2i = input21[0];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[1];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[2];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[3];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	i = 4;
	input2i = input21[i];
	fproduct_sum_scalar_multiplication_(output, input, input2i);
}
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
	{
		u128 b4;
		u128 b0;
		u128 b4_;
		u128 b0_;
		u64 i0;
		u64 i1;
		u64 i0_;
		u64 i1_;
		u128 t[5] = { 0 };
		fmul_mul_shift_reduce_(t, tmp, input21);
		fproduct_carry_wide_(t);
		b4 = t[4];
		b0 = t[0];
		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
		t[4] = b4_;
		t[0] = b0_;
		fproduct_copy_from_wide_(output, t);
		i0 = output[0];
		i1 = output[1];
		i0_ = i0 & 0x7ffffffffffffLLU;
		i1_ = i1 + (i0 >> 51);
		output[0] = i0_;
		output[1] = i1_;
	}
}
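/*
 * Dedicated squaring: the same schoolbook product as fmul_fmul(), but
 * with the symmetric cross terms doubled up front (d0, d1) and the
 * contributions that wrap past 2^255 pre-multiplied by 19 (d2, d419, d4).
 */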
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
	u64 r0 = output[0];
	u64 r1 = output[1];
	u64 r2 = output[2];
	u64 r3 = output[3];
	u64 r4 = output[4];
	u64 d0 = r0 * 2;
	u64 d1 = r1 * 2;
	u64 d2 = r2 * 2 * 19;
	u64 d419 = r4 * 19;
	u64 d4 = d419 * 2;
	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
		   (((u128)(d2) * (r3))));
	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
		   (((u128)(r3 * 19) * (r3))));
	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
		   (((u128)(d4) * (r3))));
	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
		   (((u128)(r4) * (d419))));
	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
		   (((u128)(r2) * (r2))));
	tmp[0] = s0;
	tmp[1] = s1;
	tmp[2] = s2;
	tmp[3] = s3;
	tmp[4] = s4;
}
static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	fsquare_fsquare__(tmp, output);
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
	i0 = output[0];
	i1 = output[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	output[0] = i0_;
	output[1] = i1_;
}

static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
						   u32 count1)
{
	u32 i;
	fsquare_fsquare_(tmp, output);
	for (i = 1; i < count1; ++i)
		fsquare_fsquare_(tmp, output);
}

static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
						  u32 count1)
{
	u128 t[5];
	memcpy(output, input, 5 * sizeof(*input));
	fsquare_fsquare_times_(output, t, count1);
}

static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
							  u32 count1)
{
	u128 t[5];
	fsquare_fsquare_times_(output, t, count1);
}
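/*
 * Field inversion via Fermat's little theorem: out = z^(p - 2) mod p,
 * computed with a fixed sequence of squarings and multiplications so
 * that the work performed does not depend on the value of z.
 */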
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
	u64 buf[20] = { 0 };
	u64 *a0 = buf;
	u64 *t00 = buf + 5;
	u64 *b0 = buf + 10;
	u64 *t01;
	u64 *b1;
	u64 *c0;
	u64 *a;
	u64 *t0;
	u64 *b;
	u64 *c;
	fsquare_fsquare_times(a0, z, 1);
	fsquare_fsquare_times(t00, a0, 2);
	fmul_fmul(b0, t00, z);
	fmul_fmul(a0, b0, a0);
	fsquare_fsquare_times(t00, a0, 1);
	fmul_fmul(b0, t00, b0);
	fsquare_fsquare_times(t00, b0, 5);
	t01 = buf + 5;
	b1 = buf + 10;
	c0 = buf + 15;
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 10);
	fmul_fmul(c0, t01, b1);
	fsquare_fsquare_times(t01, c0, 20);
	fmul_fmul(t01, t01, c0);
	fsquare_fsquare_times_inplace(t01, 10);
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 50);
	a = buf;
	t0 = buf + 5;
	b = buf + 10;
	c = buf + 15;
	fmul_fmul(c, t0, b);
	fsquare_fsquare_times(t0, c, 100);
	fmul_fmul(t0, t0, c);
	fsquare_fsquare_times_inplace(t0, 50);
	fmul_fmul(t0, t0, b);
	fsquare_fsquare_times_inplace(t0, 5);
	fmul_fmul(out, t0, a);
}

static __always_inline void fsum(u64 *a, u64 *b)
{
	a[0] += b[0];
	a[1] += b[1];
	a[2] += b[2];
	a[3] += b[3];
	a[4] += b[4];
}
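/*
 * Compute a := b - a. The constants added to the limbs of b are the
 * radix-2^51 limbs of 8*p, so every per-limb subtraction stays
 * non-negative; the result is congruent to b - a modulo p.
 */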
static __always_inline void fdifference(u64 *a, u64 *b)
{
	u64 tmp[5] = { 0 };
	u64 b0;
	u64 b1;
	u64 b2;
	u64 b3;
	u64 b4;
	memcpy(tmp, b, 5 * sizeof(*b));
	b0 = tmp[0];
	b1 = tmp[1];
	b2 = tmp[2];
	b3 = tmp[3];
	b4 = tmp[4];
	tmp[0] = b0 + 0x3fffffffffff68LLU;
	tmp[1] = b1 + 0x3ffffffffffff8LLU;
	tmp[2] = b2 + 0x3ffffffffffff8LLU;
	tmp[3] = b3 + 0x3ffffffffffff8LLU;
	tmp[4] = b4 + 0x3ffffffffffff8LLU;
	{
		u64 xi = a[0];
		u64 yi = tmp[0];
		a[0] = yi - xi;
	}
	{
		u64 xi = a[1];
		u64 yi = tmp[1];
		a[1] = yi - xi;
	}
	{
		u64 xi = a[2];
		u64 yi = tmp[2];
		a[2] = yi - xi;
	}
	{
		u64 xi = a[3];
		u64 yi = tmp[3];
		a[3] = yi - xi;
	}
	{
		u64 xi = a[4];
		u64 yi = tmp[4];
		a[4] = yi - xi;
	}
}

static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
	u128 tmp[5];
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	{
		u64 xi = b[0];
		tmp[0] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[1];
		tmp[1] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[2];
		tmp[2] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[3];
		tmp[3] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[4];
		tmp[4] = ((u128)(xi) * (s));
	}
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
}
static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
	fmul_fmul(output, a, b);
}

static __always_inline void crecip(u64 *output, u64 *input)
{
	crecip_crecip(output, input);
}

static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
							 u64 swap1, u32 ctr)
{
	u32 i = ctr - 1;
	u64 ai = a[i];
	u64 bi = b[i];
	u64 x = swap1 & (ai ^ bi);
	u64 ai1 = ai ^ x;
	u64 bi1 = bi ^ x;
	a[i] = ai1;
	b[i] = bi1;
}

static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
	point_swap_conditional_step(a, b, swap1, 5);
	point_swap_conditional_step(a, b, swap1, 4);
	point_swap_conditional_step(a, b, swap1, 3);
	point_swap_conditional_step(a, b, swap1, 2);
	point_swap_conditional_step(a, b, swap1, 1);
}

static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
	u64 swap1 = 0 - iswap;
	point_swap_conditional5(a, b, swap1);
	point_swap_conditional5(a + 5, b + 5, swap1);
}

static __always_inline void point_copy(u64 *output, u64 *input)
{
	memcpy(output, input, 5 * sizeof(*input));
	memcpy(output + 5, input + 5, 5 * sizeof(*input));
}
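/*
 * One combined Montgomery double-and-add ("fmonty") step on projective
 * x-only coordinates (X:Z): pp receives the doubling of p and ppq the
 * differential addition of p and pq, where qmqp holds the x-coordinate
 * of their fixed difference (the input base point throughout the
 * ladder). The scalar 121665 is (A - 2) / 4 for the curve coefficient
 * A = 486662.
 */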
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
						u64 *pq, u64 *qmqp)
{
	u64 *qx = qmqp;
	u64 *x2 = pp;
	u64 *z2 = pp + 5;
	u64 *x3 = ppq;
	u64 *z3 = ppq + 5;
	u64 *x = p;
	u64 *z = p + 5;
	u64 *xprime = pq;
	u64 *zprime = pq + 5;
	u64 buf[40] = { 0 };
	u64 *origx = buf;
	u64 *origxprime0 = buf + 5;
	u64 *xxprime0;
	u64 *zzprime0;
	u64 *origxprime;
	xxprime0 = buf + 25;
	zzprime0 = buf + 30;
	memcpy(origx, x, 5 * sizeof(*x));
	fsum(x, z);
	fdifference(z, origx);
	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
	fsum(xprime, zprime);
	fdifference(zprime, origxprime0);
	fmul(xxprime0, xprime, z);
	fmul(zzprime0, x, zprime);
	origxprime = buf + 5;
	{
		u64 *xx0;
		u64 *zz0;
		u64 *xxprime;
		u64 *zzprime;
		u64 *zzzprime;
		xx0 = buf + 15;
		zz0 = buf + 20;
		xxprime = buf + 25;
		zzprime = buf + 30;
		zzzprime = buf + 35;
		memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
		fsum(xxprime, zzprime);
		fdifference(zzprime, origxprime);
		fsquare_fsquare_times(x3, xxprime, 1);
		fsquare_fsquare_times(zzzprime, zzprime, 1);
		fmul(z3, zzzprime, qx);
		fsquare_fsquare_times(xx0, x, 1);
		fsquare_fsquare_times(zz0, z, 1);
		{
			u64 *zzz;
			u64 *xx;
			u64 *zz;
			u64 scalar;
			zzz = buf + 10;
			xx = buf + 15;
			zz = buf + 20;
			fmul(x2, xx, zz);
			fdifference(zz, xx);
			scalar = 121665;
			fscalar(zzz, zz, scalar);
			fsum(zzz, xx);
			fmul(z2, zzz, zz);
		}
	}
}
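/*
 * Montgomery ladder. Each small-loop step selects on the top bit of
 * 'byt': it conditionally swaps (nq, nqpq), runs one fmonty step, then
 * conditionally swaps the results. A double step consumes two bits, the
 * small loop consumes one byte (4 double steps), and the big loop walks
 * the 32 scalar bytes from the most significant byte down. All selection
 * is done with masks, so the sequence of operations is independent of
 * the secret scalar.
 */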
static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				       u64 *q, u8 byt)
{
	u64 bit0 = (u64)(byt >> 7);
	u64 bit;
	point_swap_conditional(nq, nqpq, bit0);
	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
	bit = (u64)(byt >> 7);
	point_swap_conditional(nq2, nqpq2, bit);
}

static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
					      u64 *nqpq2, u64 *q, u8 byt)
{
	u8 byt1;
	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
	byt1 = byt << 1;
	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}

static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				  u64 *q, u8 byt, u32 i)
{
	while (i--) {
		ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
							      nqpq2, q, byt);
		byt <<= 2;
	}
}

static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
							   u64 *nqpq, u64 *nq2,
							   u64 *nqpq2, u64 *q,
							   u32 i)
{
	while (i--) {
		u8 byte = n1[i];
		ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
						  byte, 4);
	}
}

static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
	u64 point_buf[40] = { 0 };
	u64 *nq = point_buf;
	u64 *nqpq = point_buf + 10;
	u64 *nq2 = point_buf + 20;
	u64 *nqpq2 = point_buf + 30;
	point_copy(nqpq, q);
	nq[0] = 1;
	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
	point_copy(result, nq);
}
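/*
 * Unpack a 32-byte little-endian field element into five 51-bit limbs,
 * reading overlapping 64-bit words at bit offsets 0, 51, 102, 153 and
 * 204. The top bit of the final byte is discarded, as required for
 * X25519 u-coordinates.
 */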
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
	const u8 *x00 = input + 6;
	const u8 *x01 = input + 12;
	const u8 *x02 = input + 19;
	const u8 *x0 = input + 24;
	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
	i0 = get_unaligned_le64(input);
	i1 = get_unaligned_le64(x00);
	i2 = get_unaligned_le64(x01);
	i3 = get_unaligned_le64(x02);
	i4 = get_unaligned_le64(x0);
	output0 = i0 & 0x7ffffffffffffLLU;
	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
	output[0] = output0;
	output[1] = output1;
	output[2] = output2;
	output[3] = output3;
	output[4] = output4;
}

static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
	format_fcontract_first_carry_pass(input);
	modulo_carry_top(input);
}

static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	format_fcontract_second_carry_pass(input);
	modulo_carry_top(input);
	i0 = input[0];
	i1 = input[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	input[0] = i0_;
	input[1] = i1_;
}
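/*
 * The two carry passes leave the element reduced enough that at most one
 * conditional subtraction of p = 2^255 - 19 remains. The branchless masks
 * select that subtraction, producing the unique canonical representative
 * without leaking whether it was taken.
 */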
static __always_inline void format_fcontract_trim(u64 *input)
{
	u64 a0 = input[0];
	u64 a1 = input[1];
	u64 a2 = input[2];
	u64 a3 = input[3];
	u64 a4 = input[4];
	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
	input[0] = a0_;
	input[1] = a1_;
	input[2] = a2_;
	input[3] = a3_;
	input[4] = a4_;
}

static __always_inline void format_fcontract_store(u8 *output, u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 o0 = t1 << 51 | t0;
	u64 o1 = t2 << 38 | t1 >> 13;
	u64 o2 = t3 << 25 | t2 >> 26;
	u64 o3 = t4 << 12 | t3 >> 39;
	u8 *b0 = output;
	u8 *b1 = output + 8;
	u8 *b2 = output + 16;
	u8 *b3 = output + 24;
	put_unaligned_le64(o0, b0);
	put_unaligned_le64(o1, b1);
	put_unaligned_le64(o2, b2);
	put_unaligned_le64(o3, b3);
}

static __always_inline void format_fcontract(u8 *output, u64 *input)
{
	format_fcontract_first_carry_full(input);
	format_fcontract_second_carry_full(input);
	format_fcontract_trim(input);
	format_fcontract_store(output, input);
}

static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
{
	u64 *x = point;
	u64 *z = point + 5;
	u64 buf[10] __aligned(32) = { 0 };
	u64 *zmone = buf;
	u64 *sc = buf + 5;
	crecip(zmone, z);
	fmul(sc, x, zmone);
	format_fcontract(scalar, sc);
}
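/*
 * curve25519_generic() - X25519 scalar multiplication, portable C fallback.
 * @mypublic: output, the 32-byte u-coordinate of secret * basepoint
 * @secret: the 32-byte private scalar (clamped internally)
 * @basepoint: the 32-byte u-coordinate to multiply
 *
 * Expands the base point, runs the constant-time Montgomery ladder, converts
 * the projective result back to affine (one field inversion), and wipes all
 * intermediate state.
 *
 * Illustrative use only (a minimal sketch with a hypothetical caller-supplied
 * key; callers normally go through the curve25519 library wrappers rather
 * than calling this directly):
 *
 *	static const u8 base[CURVE25519_KEY_SIZE] = { 9 };	// generator u = 9
 *	u8 pub[CURVE25519_KEY_SIZE];
 *
 *	curve25519_generic(pub, my_secret, base);	// pub = my_secret * G
 */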
void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
			const u8 secret[CURVE25519_KEY_SIZE],
			const u8 basepoint[CURVE25519_KEY_SIZE])
{
	u64 buf0[10] __aligned(32) = { 0 };
	u64 *x0 = buf0;
	u64 *z = buf0 + 5;
	u64 *q;
	format_fexpand(x0, basepoint);
	z[0] = 1;
	q = buf0;
	{
		u8 e[32] __aligned(32) = { 0 };
		u8 *scalar;
		memcpy(e, secret, 32);
		curve25519_clamp_secret(e);
		scalar = e;
		{
			u64 buf[15] = { 0 };
			u64 *nq = buf;
			u64 *x = nq;
			x[0] = 1;
			ladder_cmult(nq, scalar, q);
			format_scalar_of_point(mypublic, nq);
			memzero_explicit(buf, sizeof(buf));
		}
		memzero_explicit(e, sizeof(e));
	}
	memzero_explicit(buf0, sizeof(buf0));
}