GitHub Repository: torvalds/linux
Path: blob/master/lib/crypto/curve25519-hacl64.c
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */

#include <linux/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>
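/*
 * Field elements are stored in radix 2^51: five u64 limbs, each holding
 * 51 bits of a value modulo p = 2^255 - 19. The mask 0x7ffffffffffff is
 * 2^51 - 1. The helpers below compare u64 values without branches and
 * return an all-ones or all-zero mask, which keeps selections
 * constant-time with respect to secret data.
 */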
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	u64 c = xnx - (u64)1U;
	return c;
}

static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	u64 c = x_xor_q_ - (u64)1U;
	return c;
}
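/*
 * Fold the carry out of the top limb back into the bottom limb, using
 * 2^255 == 19 (mod p): each unit carried above bit 51 of b[4] is worth
 * 19 when added to b[0].
 */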
static __always_inline void modulo_carry_top(u64 *b)
{
	u64 b4 = b[4];
	u64 b0 = b[0];
	u64 b4_ = b4 & 0x7ffffffffffffLLU;
	u64 b0_ = b0 + 19 * (b4 >> 51);
	b[4] = b4_;
	b[0] = b0_;
}
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
	{
		u128 xi = input[0];
		output[0] = ((u64)(xi));
	}
	{
		u128 xi = input[1];
		output[1] = ((u64)(xi));
	}
	{
		u128 xi = input[2];
		output[2] = ((u64)(xi));
	}
	{
		u128 xi = input[3];
		output[3] = ((u64)(xi));
	}
	{
		u128 xi = input[4];
		output[4] = ((u64)(xi));
	}
}

static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
	output[0] += (u128)input[0] * s;
	output[1] += (u128)input[1] * s;
	output[2] += (u128)input[2] * s;
	output[3] += (u128)input[3] * s;
	output[4] += (u128)input[4] * s;
}
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
	{
		u32 ctr = 0;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 1;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}

	{
		u32 ctr = 2;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 3;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
}
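/*
 * fmul_shift_reduce() multiplies a field element by 2^51 in place: the
 * limbs are rotated up one position and the limb that wraps around is
 * multiplied by 19 (2^255 == 19 mod p). fmul_mul_shift_reduce_() uses it
 * to accumulate input * input21 one 51-bit digit of input21 at a time.
 */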
static __always_inline void fmul_shift_reduce(u64 *output)
{
	u64 tmp = output[4];
	u64 b0;
	{
		u32 ctr = 5 - 0 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 1 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 2 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 3 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	output[0] = tmp;
	b0 = output[0];
	output[0] = 19 * b0;
}

static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
						   u64 *input21)
{
	u32 i;
	u64 input2i;
	{
		u64 input2i = input21[0];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[1];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[2];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[3];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	i = 4;
	input2i = input21[i];
	fproduct_sum_scalar_multiplication_(output, input, input2i);
}
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
	{
		u128 b4;
		u128 b0;
		u128 b4_;
		u128 b0_;
		u64 i0;
		u64 i1;
		u64 i0_;
		u64 i1_;
		u128 t[5] = { 0 };
		fmul_mul_shift_reduce_(t, tmp, input21);
		fproduct_carry_wide_(t);
		b4 = t[4];
		b0 = t[0];
		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
		t[4] = b4_;
		t[0] = b0_;
		fproduct_copy_from_wide_(output, t);
		i0 = output[0];
		i1 = output[1];
		i0_ = i0 & 0x7ffffffffffffLLU;
		i1_ = i1 + (i0 >> 51);
		output[0] = i0_;
		output[1] = i1_;
	}
}
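/*
 * Dedicated squaring: the same schoolbook product as fmul_fmul(), but
 * with the symmetric cross terms doubled up front (d0, d1) and the
 * contributions that wrap past 2^255 pre-multiplied by 19 (d2, d419, d4).
 */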
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
	u64 r0 = output[0];
	u64 r1 = output[1];
	u64 r2 = output[2];
	u64 r3 = output[3];
	u64 r4 = output[4];
	u64 d0 = r0 * 2;
	u64 d1 = r1 * 2;
	u64 d2 = r2 * 2 * 19;
	u64 d419 = r4 * 19;
	u64 d4 = d419 * 2;
	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
		   (((u128)(d2) * (r3))));
	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
		   (((u128)(r3 * 19) * (r3))));
	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
		   (((u128)(d4) * (r3))));
	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
		   (((u128)(r4) * (d419))));
	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
		   (((u128)(r2) * (r2))));
	tmp[0] = s0;
	tmp[1] = s1;
	tmp[2] = s2;
	tmp[3] = s3;
	tmp[4] = s4;
}
static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	fsquare_fsquare__(tmp, output);
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
	i0 = output[0];
	i1 = output[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	output[0] = i0_;
	output[1] = i1_;
}

static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
						   u32 count1)
{
	u32 i;
	fsquare_fsquare_(tmp, output);
	for (i = 1; i < count1; ++i)
		fsquare_fsquare_(tmp, output);
}

static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
						  u32 count1)
{
	u128 t[5];
	memcpy(output, input, 5 * sizeof(*input));
	fsquare_fsquare_times_(output, t, count1);
}

static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
							  u32 count1)
{
	u128 t[5];
	fsquare_fsquare_times_(output, t, count1);
}
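/*
 * Field inversion via Fermat's little theorem: out = z^(p - 2) mod p,
 * computed with a fixed sequence of squarings and multiplications so
 * that the work performed does not depend on the value of z.
 */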
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
	u64 buf[20] = { 0 };
	u64 *a0 = buf;
	u64 *t00 = buf + 5;
	u64 *b0 = buf + 10;
	u64 *t01;
	u64 *b1;
	u64 *c0;
	u64 *a;
	u64 *t0;
	u64 *b;
	u64 *c;
	fsquare_fsquare_times(a0, z, 1);
	fsquare_fsquare_times(t00, a0, 2);
	fmul_fmul(b0, t00, z);
	fmul_fmul(a0, b0, a0);
	fsquare_fsquare_times(t00, a0, 1);
	fmul_fmul(b0, t00, b0);
	fsquare_fsquare_times(t00, b0, 5);
	t01 = buf + 5;
	b1 = buf + 10;
	c0 = buf + 15;
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 10);
	fmul_fmul(c0, t01, b1);
	fsquare_fsquare_times(t01, c0, 20);
	fmul_fmul(t01, t01, c0);
	fsquare_fsquare_times_inplace(t01, 10);
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 50);
	a = buf;
	t0 = buf + 5;
	b = buf + 10;
	c = buf + 15;
	fmul_fmul(c, t0, b);
	fsquare_fsquare_times(t0, c, 100);
	fmul_fmul(t0, t0, c);
	fsquare_fsquare_times_inplace(t0, 50);
	fmul_fmul(t0, t0, b);
	fsquare_fsquare_times_inplace(t0, 5);
	fmul_fmul(out, t0, a);
}

static __always_inline void fsum(u64 *a, u64 *b)
{
	a[0] += b[0];
	a[1] += b[1];
	a[2] += b[2];
	a[3] += b[3];
	a[4] += b[4];
}
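/*
 * Compute a := b - a. The constants added to the limbs of b are the
 * radix-2^51 limbs of 8*p, so every per-limb subtraction stays
 * non-negative; the result is congruent to b - a modulo p.
 */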
static __always_inline void fdifference(u64 *a, u64 *b)
{
	u64 tmp[5] = { 0 };
	u64 b0;
	u64 b1;
	u64 b2;
	u64 b3;
	u64 b4;
	memcpy(tmp, b, 5 * sizeof(*b));
	b0 = tmp[0];
	b1 = tmp[1];
	b2 = tmp[2];
	b3 = tmp[3];
	b4 = tmp[4];
	tmp[0] = b0 + 0x3fffffffffff68LLU;
	tmp[1] = b1 + 0x3ffffffffffff8LLU;
	tmp[2] = b2 + 0x3ffffffffffff8LLU;
	tmp[3] = b3 + 0x3ffffffffffff8LLU;
	tmp[4] = b4 + 0x3ffffffffffff8LLU;
	{
		u64 xi = a[0];
		u64 yi = tmp[0];
		a[0] = yi - xi;
	}
	{
		u64 xi = a[1];
		u64 yi = tmp[1];
		a[1] = yi - xi;
	}
	{
		u64 xi = a[2];
		u64 yi = tmp[2];
		a[2] = yi - xi;
	}
	{
		u64 xi = a[3];
		u64 yi = tmp[3];
		a[3] = yi - xi;
	}
	{
		u64 xi = a[4];
		u64 yi = tmp[4];
		a[4] = yi - xi;
	}
}

static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
	u128 tmp[5];
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	{
		u64 xi = b[0];
		tmp[0] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[1];
		tmp[1] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[2];
		tmp[2] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[3];
		tmp[3] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[4];
		tmp[4] = ((u128)(xi) * (s));
	}
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
}
static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
	fmul_fmul(output, a, b);
}

static __always_inline void crecip(u64 *output, u64 *input)
{
	crecip_crecip(output, input);
}

static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
							 u64 swap1, u32 ctr)
{
	u32 i = ctr - 1;
	u64 ai = a[i];
	u64 bi = b[i];
	u64 x = swap1 & (ai ^ bi);
	u64 ai1 = ai ^ x;
	u64 bi1 = bi ^ x;
	a[i] = ai1;
	b[i] = bi1;
}

static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
	point_swap_conditional_step(a, b, swap1, 5);
	point_swap_conditional_step(a, b, swap1, 4);
	point_swap_conditional_step(a, b, swap1, 3);
	point_swap_conditional_step(a, b, swap1, 2);
	point_swap_conditional_step(a, b, swap1, 1);
}

static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
	u64 swap1 = 0 - iswap;
	point_swap_conditional5(a, b, swap1);
	point_swap_conditional5(a + 5, b + 5, swap1);
}

static __always_inline void point_copy(u64 *output, u64 *input)
{
	memcpy(output, input, 5 * sizeof(*input));
	memcpy(output + 5, input + 5, 5 * sizeof(*input));
}
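/*
 * One combined Montgomery double-and-add ("fmonty") step on projective
 * x-only coordinates (X:Z): pp receives the doubling of p and ppq the
 * differential addition of p and pq, where qmqp holds the x-coordinate
 * of their fixed difference (the input base point throughout the
 * ladder). The scalar 121665 is (A - 2) / 4 for the curve coefficient
 * A = 486662.
 */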
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
						u64 *pq, u64 *qmqp)
{
	u64 *qx = qmqp;
	u64 *x2 = pp;
	u64 *z2 = pp + 5;
	u64 *x3 = ppq;
	u64 *z3 = ppq + 5;
	u64 *x = p;
	u64 *z = p + 5;
	u64 *xprime = pq;
	u64 *zprime = pq + 5;
	u64 buf[40] = { 0 };
	u64 *origx = buf;
	u64 *origxprime0 = buf + 5;
	u64 *xxprime0;
	u64 *zzprime0;
	u64 *origxprime;
	xxprime0 = buf + 25;
	zzprime0 = buf + 30;
	memcpy(origx, x, 5 * sizeof(*x));
	fsum(x, z);
	fdifference(z, origx);
	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
	fsum(xprime, zprime);
	fdifference(zprime, origxprime0);
	fmul(xxprime0, xprime, z);
	fmul(zzprime0, x, zprime);
	origxprime = buf + 5;
	{
		u64 *xx0;
		u64 *zz0;
		u64 *xxprime;
		u64 *zzprime;
		u64 *zzzprime;
		xx0 = buf + 15;
		zz0 = buf + 20;
		xxprime = buf + 25;
		zzprime = buf + 30;
		zzzprime = buf + 35;
		memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
		fsum(xxprime, zzprime);
		fdifference(zzprime, origxprime);
		fsquare_fsquare_times(x3, xxprime, 1);
		fsquare_fsquare_times(zzzprime, zzprime, 1);
		fmul(z3, zzzprime, qx);
		fsquare_fsquare_times(xx0, x, 1);
		fsquare_fsquare_times(zz0, z, 1);
		{
			u64 *zzz;
			u64 *xx;
			u64 *zz;
			u64 scalar;
			zzz = buf + 10;
			xx = buf + 15;
			zz = buf + 20;
			fmul(x2, xx, zz);
			fdifference(zz, xx);
			scalar = 121665;
			fscalar(zzz, zz, scalar);
			fsum(zzz, xx);
			fmul(z2, zzz, zz);
		}
	}
}
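/*
 * Montgomery ladder. Each small-loop step selects on the top bit of
 * 'byt': it conditionally swaps (nq, nqpq), runs one fmonty step, then
 * conditionally swaps the results. A double step consumes two bits, the
 * small loop consumes one byte (4 double steps), and the big loop walks
 * the 32 scalar bytes from the most significant byte down. All selection
 * is done with masks, so the sequence of operations is independent of
 * the secret scalar.
 */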
static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				       u64 *q, u8 byt)
{
	u64 bit0 = (u64)(byt >> 7);
	u64 bit;
	point_swap_conditional(nq, nqpq, bit0);
	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
	bit = (u64)(byt >> 7);
	point_swap_conditional(nq2, nqpq2, bit);
}

static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
					      u64 *nqpq2, u64 *q, u8 byt)
{
	u8 byt1;
	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
	byt1 = byt << 1;
	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}

static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				  u64 *q, u8 byt, u32 i)
{
	while (i--) {
		ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
							      nqpq2, q, byt);
		byt <<= 2;
	}
}

static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
							   u64 *nqpq, u64 *nq2,
							   u64 *nqpq2, u64 *q,
							   u32 i)
{
	while (i--) {
		u8 byte = n1[i];
		ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
						  byte, 4);
	}
}

static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
	u64 point_buf[40] = { 0 };
	u64 *nq = point_buf;
	u64 *nqpq = point_buf + 10;
	u64 *nq2 = point_buf + 20;
	u64 *nqpq2 = point_buf + 30;
	point_copy(nqpq, q);
	nq[0] = 1;
	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
	point_copy(result, nq);
}
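/*
 * Unpack a 32-byte little-endian field element into five 51-bit limbs,
 * reading overlapping 64-bit words at bit offsets 0, 51, 102, 153 and
 * 204. The top bit of the final byte is discarded, as required for
 * X25519 u-coordinates.
 */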
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
	const u8 *x00 = input + 6;
	const u8 *x01 = input + 12;
	const u8 *x02 = input + 19;
	const u8 *x0 = input + 24;
	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
	i0 = get_unaligned_le64(input);
	i1 = get_unaligned_le64(x00);
	i2 = get_unaligned_le64(x01);
	i3 = get_unaligned_le64(x02);
	i4 = get_unaligned_le64(x0);
	output0 = i0 & 0x7ffffffffffffLLU;
	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
	output[0] = output0;
	output[1] = output1;
	output[2] = output2;
	output[3] = output3;
	output[4] = output4;
}

static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
	format_fcontract_first_carry_pass(input);
	modulo_carry_top(input);
}

static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	format_fcontract_second_carry_pass(input);
	modulo_carry_top(input);
	i0 = input[0];
	i1 = input[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	input[0] = i0_;
	input[1] = i1_;
}
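/*
 * The two carry passes leave the element reduced enough that at most one
 * conditional subtraction of p = 2^255 - 19 remains. The branchless masks
 * select that subtraction, producing the unique canonical representative
 * without leaking whether it was taken.
 */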
static __always_inline void format_fcontract_trim(u64 *input)
{
	u64 a0 = input[0];
	u64 a1 = input[1];
	u64 a2 = input[2];
	u64 a3 = input[3];
	u64 a4 = input[4];
	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
	input[0] = a0_;
	input[1] = a1_;
	input[2] = a2_;
	input[3] = a3_;
	input[4] = a4_;
}

static __always_inline void format_fcontract_store(u8 *output, u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 o0 = t1 << 51 | t0;
	u64 o1 = t2 << 38 | t1 >> 13;
	u64 o2 = t3 << 25 | t2 >> 26;
	u64 o3 = t4 << 12 | t3 >> 39;
	u8 *b0 = output;
	u8 *b1 = output + 8;
	u8 *b2 = output + 16;
	u8 *b3 = output + 24;
	put_unaligned_le64(o0, b0);
	put_unaligned_le64(o1, b1);
	put_unaligned_le64(o2, b2);
	put_unaligned_le64(o3, b3);
}

static __always_inline void format_fcontract(u8 *output, u64 *input)
{
	format_fcontract_first_carry_full(input);
	format_fcontract_second_carry_full(input);
	format_fcontract_trim(input);
	format_fcontract_store(output, input);
}

static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
{
	u64 *x = point;
	u64 *z = point + 5;
	u64 buf[10] __aligned(32) = { 0 };
	u64 *zmone = buf;
	u64 *sc = buf + 5;
	crecip(zmone, z);
	fmul(sc, x, zmone);
	format_fcontract(scalar, sc);
}
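/*
 * curve25519_generic() - X25519 scalar multiplication, portable C fallback.
 * @mypublic: output, the 32-byte u-coordinate of secret * basepoint
 * @secret: the 32-byte private scalar (clamped internally)
 * @basepoint: the 32-byte u-coordinate to multiply
 *
 * Expands the base point, runs the constant-time Montgomery ladder, converts
 * the projective result back to affine (one field inversion), and wipes all
 * intermediate state.
 *
 * Illustrative use only (a minimal sketch with a hypothetical caller-supplied
 * key; callers normally go through the curve25519 library wrappers rather
 * than calling this directly):
 *
 *	static const u8 base[CURVE25519_KEY_SIZE] = { 9 };	// generator u = 9
 *	u8 pub[CURVE25519_KEY_SIZE];
 *
 *	curve25519_generic(pub, my_secret, base);	// pub = my_secret * G
 */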
void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
			const u8 secret[CURVE25519_KEY_SIZE],
			const u8 basepoint[CURVE25519_KEY_SIZE])
{
	u64 buf0[10] __aligned(32) = { 0 };
	u64 *x0 = buf0;
	u64 *z = buf0 + 5;
	u64 *q;
	format_fexpand(x0, basepoint);
	z[0] = 1;
	q = buf0;
	{
		u8 e[32] __aligned(32) = { 0 };
		u8 *scalar;
		memcpy(e, secret, 32);
		curve25519_clamp_secret(e);
		scalar = e;
		{
			u64 buf[15] = { 0 };
			u64 *nq = buf;
			u64 *x = nq;
			x[0] = 1;
			ladder_cmult(nq, scalar, q);
			format_scalar_of_point(mypublic, nq);
			memzero_explicit(buf, sizeof(buf));
		}
		memzero_explicit(e, sizeof(e));
	}
	memzero_explicit(buf0, sizeof(buf0));
}