Ardupilot
GitHub Repository: Ardupilot/ardupilot
Path: blob/master/libraries/AP_CheckFirmware/monocypher.cpp
// Monocypher version 3.1.2
//
// This file is dual-licensed. Choose whichever licence you want from
// the two licences listed below.
//
// The first licence is a regular 2-clause BSD licence. The second licence
// is the CC-0 from Creative Commons. It is intended to release Monocypher
// to the public domain. The BSD licence serves as a fallback option.
//
// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
//
// ------------------------------------------------------------------------
//
// Copyright (c) 2017-2020, Loup Vaillant
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the
//    distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ------------------------------------------------------------------------
//
// Written in 2017-2020 by Loup Vaillant
//
// To the extent possible under law, the author(s) have dedicated all copyright
// and related neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication along
// with this software. If not, see
// <https://creativecommons.org/publicdomain/zero/1.0/>

#include "monocypher.h"

// we don't need Argon2
#define MONOCYPHER_ARGON2_ENABLE 0

// we want the bootloader to be as small as possible
#define BLAKE2_NO_UNROLLING 1

/////////////////
/// Utilities ///
/////////////////
#define FOR_T(type, i0, start, end) for (type i0 = (start); i0 < (end); i0++)
#define FOR(i1, start, end)         FOR_T(size_t, i1, start, end)
#define COPY(dst, src, size)        FOR(i2, 0, size) (dst)[i2] = (src)[i2]
#define ZERO(buf, size)             FOR(i3, 0, size) (buf)[i3] = 0
#define WIPE_CTX(ctx)               crypto_wipe(ctx   , sizeof(*(ctx)))
#define WIPE_BUFFER(buffer)         crypto_wipe(buffer, sizeof(buffer))
#define MIN(a, b)                   ((a) <= (b) ? (a) : (b))
#define MAX(a, b)                   ((a) >= (b) ? (a) : (b))

typedef int8_t   i8;
typedef uint8_t  u8;
typedef int16_t  i16;
typedef uint32_t u32;
typedef int32_t  i32;
typedef int64_t  i64;
typedef uint64_t u64;

static const u8 zero[128] = {0};

// returns the smallest positive integer y such that
// (x + y) % pow_2 == 0
// Basically, it's how many bytes we need to add to "align" x.
// Only works when pow_2 is a power of 2.
// Note: we use ~x+1 instead of -x to avoid compiler warnings
static size_t align(size_t x, size_t pow_2)
{
    return (~x + 1) & (pow_2 - 1);
}
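// Editor's note (not part of the original source): for example,
// align(13, 16) == 3, align(16, 16) == 0, align(0, 16) == 0 --
// i.e. the number of bytes needed to reach the next multiple of pow_2.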

static u32 load24_le(const u8 s[3])
{
    return (u32)s[0]
        | ((u32)s[1] << 8)
        | ((u32)s[2] << 16);
}

static u32 load32_le(const u8 s[4])
{
    return (u32)s[0]
        | ((u32)s[1] << 8)
        | ((u32)s[2] << 16)
        | ((u32)s[3] << 24);
}

static u64 load64_le(const u8 s[8])
{
    return load32_le(s) | ((u64)load32_le(s+4) << 32);
}

static void store32_le(u8 out[4], u32 in)
{
    out[0] =  in        & 0xff;
    out[1] = (in >>  8) & 0xff;
    out[2] = (in >> 16) & 0xff;
    out[3] = (in >> 24) & 0xff;
}

static void store64_le(u8 out[8], u64 in)
{
    store32_le(out    , (u32)in );
    store32_le(out + 4, in >> 32);
}

static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
    FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
}
static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
    FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
}
static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
    FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
}
static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
    FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
}

static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }

static int neq0(u64 diff)
{   // constant time comparison to zero
    // return diff != 0 ? -1 : 0
    u64 half = (diff >> 32) | ((u32)diff);
    return (1 & ((half - 1) >> 32)) - 1;
}

static u64 x16(const u8 a[16], const u8 b[16])
{
    return (load64_le(a + 0) ^ load64_le(b + 0))
        |  (load64_le(a + 8) ^ load64_le(b + 8));
}
static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }

void crypto_wipe(void *secret, size_t size)
{
    volatile u8 *v_secret = (u8*)secret;
    ZERO(v_secret, size);
}
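// Editor's sketch (not part of the original source): a typical use of the two
// helpers above is to compare a received tag against a computed one in
// constant time, then erase the local secret copy:
//
//     u8 expected[16], received[16];
//     /* ... compute expected, read received ... */
//     int ok = crypto_verify16(expected, received) == 0; // 0 means equal
//     crypto_wipe(expected, sizeof(expected));           // scrub the secret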

/////////////////
/// Chacha 20 ///
/////////////////
#define QUARTERROUND(a, b, c, d)     \
    a += b;  d = rotl32(d ^ a, 16);  \
    c += d;  b = rotl32(b ^ c, 12);  \
    a += b;  d = rotl32(d ^ a,  8);  \
    c += d;  b = rotl32(b ^ c,  7)

static void chacha20_rounds(u32 out[16], const u32 in[16])
{
    // The temporary variables make Chacha20 10% faster.
    u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
    u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
    u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
    u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];

    FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
        QUARTERROUND(t0, t4, t8 , t12); // column 0
        QUARTERROUND(t1, t5, t9 , t13); // column 1
        QUARTERROUND(t2, t6, t10, t14); // column 2
        QUARTERROUND(t3, t7, t11, t15); // column 3
        QUARTERROUND(t0, t5, t10, t15); // diagonal 0
        QUARTERROUND(t1, t6, t11, t12); // diagonal 1
        QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
        QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
    }
    out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
    out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
    out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
    out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
}

static void chacha20_init_key(u32 block[16], const u8 key[32])
{
    load32_le_buf(block  , (const u8*)"expand 32-byte k", 4); // constant
    load32_le_buf(block+4, key                          , 8); // key
}

void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
{
    u32 block[16];
    chacha20_init_key(block, key);
    // input
    load32_le_buf(block + 12, in, 4);
    chacha20_rounds(block, block);
    // prevent reversal of the rounds by revealing only half of the buffer.
    store32_le_buf(out   , block   , 4); // constant
    store32_le_buf(out+16, block+12, 4); // counter and nonce
    WIPE_BUFFER(block);
}

u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                        size_t text_size, const u8 key[32], const u8 nonce[8],
                        u64 ctr)
{
    u32 input[16];
    chacha20_init_key(input, key);
    input[12] = (u32) ctr;
    input[13] = (u32)(ctr >> 32);
    load32_le_buf(input+14, nonce, 2);

    // Whole blocks
    u32    pool[16];
    size_t nb_blocks = text_size >> 6;
    FOR (i, 0, nb_blocks) {
        chacha20_rounds(pool, input);
        if (plain_text != 0) {
            FOR (j, 0, 16) {
                u32 p = pool[j] + input[j];
                store32_le(cipher_text, p ^ load32_le(plain_text));
                cipher_text += 4;
                plain_text  += 4;
            }
        } else {
            FOR (j, 0, 16) {
                u32 p = pool[j] + input[j];
                store32_le(cipher_text, p);
                cipher_text += 4;
            }
        }
        input[12]++;
        if (input[12] == 0) {
            input[13]++;
        }
    }
    text_size &= 63;

    // Last (incomplete) block
    if (text_size > 0) {
        if (plain_text == 0) {
            plain_text = zero;
        }
        chacha20_rounds(pool, input);
        u8 tmp[64];
        FOR (i, 0, 16) {
            store32_le(tmp + i*4, pool[i] + input[i]);
        }
        FOR (i, 0, text_size) {
            cipher_text[i] = tmp[i] ^ plain_text[i];
        }
        WIPE_BUFFER(tmp);
    }
    ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);

    WIPE_BUFFER(pool);
    WIPE_BUFFER(input);
    return ctr;
}
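// Editor's sketch (assumption, not in the original source): the value returned
// above is the next block counter, so a long message can be processed in
// several calls without reusing key stream, as long as every chunk but the
// last is a multiple of 64 bytes:
//
//     u64 ctr = 0;
//     ctr = crypto_chacha20_ctr(out       , in       , 4096, key, nonce, ctr);
//     ctr = crypto_chacha20_ctr(out + 4096, in + 4096, 4096, key, nonce, ctr);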

u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                             size_t text_size,
                             const u8 key[32], const u8 nonce[12], u32 ctr)
{
    u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
    return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                                    key, nonce + 4, big_ctr);
}

u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
                         size_t text_size,
                         const u8 key[32], const u8 nonce[24], u64 ctr)
{
    u8 sub_key[32];
    crypto_hchacha20(sub_key, key, nonce);
    ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                              sub_key, nonce+16, ctr);
    WIPE_BUFFER(sub_key);
    return ctr;
}

void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
                     const u8 key[32], const u8 nonce[8])
{
    crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);

}
void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
                          size_t text_size,
                          const u8 key[32], const u8 nonce[12])
{
    crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
}

void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
                      const u8 key[32], const u8 nonce[24])
{
    crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
}
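// Editor's sketch (not part of the original source): one-shot XChaCha20.
// Decryption is the same call, because the cipher is its own inverse:
//
//     u8 key[32]   = { /* secret */ };
//     u8 nonce[24] = { /* unique per message */ };
//     u8 msg[]     = "firmware chunk";
//     u8 ct[sizeof(msg)];
//     crypto_xchacha20(ct , msg, sizeof(msg), key, nonce); // encrypt
//     crypto_xchacha20(msg, ct , sizeof(msg), key, nonce); // decrypt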

/////////////////
/// Poly 1305 ///
/////////////////

// h = (h + c) * r
// preconditions:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
//   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
//   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
// Postcondition:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
static void poly_block(crypto_poly1305_ctx *ctx)
{
    // s = h + c, without carry propagation
    const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
    const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
    const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
    const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
    const u32 s4 = ctx->h[4] + ctx->c[4];      // s4 <= 5

    // Local all the things!
    const u32 r0  = ctx->r[0];       // r0  <= 0fffffff
    const u32 r1  = ctx->r[1];       // r1  <= 0ffffffc
    const u32 r2  = ctx->r[2];       // r2  <= 0ffffffc
    const u32 r3  = ctx->r[3];       // r3  <= 0ffffffc
    const u32 rr0 = (r0 >> 2) * 5;   // rr0 <= 13fffffb // lose 2 bits...
    const u32 rr1 = (r1 >> 2) + r1;  // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
    const u32 rr2 = (r2 >> 2) + r2;  // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
    const u32 rr3 = (r3 >> 2) + r3;  // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5

    // (h + c) * r, without carry propagation
    const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
    const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
    const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
    const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
    const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <= f

    // partial reduction modulo 2^130 - 5
    const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
    const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
    const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
    const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
    const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
    const u64 u4 = (u3 >> 32)     + (u5 & 3);

    // Update the hash
    ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
    ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
    ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
    ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
    ctx->h[4] = (u32)u4; // u4 <= 4
}

// (re-)initialises the input counter and input buffer
static void poly_clear_c(crypto_poly1305_ctx *ctx)
{
    ZERO(ctx->c, 4);
    ctx->c_idx = 0;
}

static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
{
    size_t word = ctx->c_idx >> 2;
    size_t byte = ctx->c_idx & 3;
    ctx->c[word] |= (u32)input << (byte * 8);
    ctx->c_idx++;
}

static void poly_update(crypto_poly1305_ctx *ctx,
                        const u8 *message, size_t message_size)
{
    FOR (i, 0, message_size) {
        poly_take_input(ctx, message[i]);
        if (ctx->c_idx == 16) {
            poly_block(ctx);
            poly_clear_c(ctx);
        }
    }
}

void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
{
    // Initial hash is zero
    ZERO(ctx->h, 5);
    // add 2^130 to every input block
    ctx->c[4] = 1;
    poly_clear_c(ctx);
    // load r and pad (r has some of its bits cleared)
    load32_le_buf(ctx->r  , key   , 4);
    load32_le_buf(ctx->pad, key+16, 4);
    FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
    FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
}

void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
                            const u8 *message, size_t message_size)
{
    if (message_size == 0) {
        return;
    }
    // Align ourselves with block boundaries
    size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
    poly_update(ctx, message, aligned);
    message      += aligned;
    message_size -= aligned;

    // Process the message block by block
    size_t nb_blocks = message_size >> 4;
    FOR (i, 0, nb_blocks) {
        load32_le_buf(ctx->c, message, 4);
        poly_block(ctx);
        message += 16;
    }
    if (nb_blocks > 0) {
        poly_clear_c(ctx);
    }
    message_size &= 15;

    // remaining bytes
    poly_update(ctx, message, message_size);
}

void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
{
    // Process the last block (if any)
    if (ctx->c_idx != 0) {
        // move the final 1 according to remaining input length
        // (We may add less than 2^130 to the last input block)
        ctx->c[4] = 0;
        poly_take_input(ctx, 1);
        // one last hash update
        poly_block(ctx);
    }

    // check if we should subtract 2^130-5 by performing the
    // corresponding carry propagation.
    u64 c = 5;
    FOR (i, 0, 4) {
        c  += ctx->h[i];
        c >>= 32;
    }
    c += ctx->h[4];
    c  = (c >> 2) * 5; // shift the carry back to the beginning
    // c now indicates how many times we should subtract 2^130-5 (0 or 1)
    FOR (i, 0, 4) {
        c += (u64)ctx->h[i] + ctx->pad[i];
        store32_le(mac + i*4, (u32)c);
        c = c >> 32;
    }
    WIPE_CTX(ctx);
}

void crypto_poly1305(u8 mac[16], const u8 *message,
                     size_t message_size, const u8 key[32])
{
    crypto_poly1305_ctx ctx;
    crypto_poly1305_init  (&ctx, key);
    crypto_poly1305_update(&ctx, message, message_size);
    crypto_poly1305_final (&ctx, mac);
}
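// Editor's sketch (not part of the original source): Poly1305 is a one-time
// authenticator -- each 32-byte key must authenticate a single message only
// (e.g. a key derived from the first ChaCha20 block, as in RFC 8439):
//
//     u8 one_time_key[32] = { /* fresh, secret */ };
//     u8 mac[16];
//     crypto_poly1305(mac, msg, msg_size, one_time_key);
//     // later: accept iff crypto_verify16(mac, received_mac) == 0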

////////////////
/// Blake2 b ///
////////////////
static const u64 iv[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};

// increment the input offset
static void blake2b_incr(crypto_blake2b_ctx *ctx)
{
    u64   *x = ctx->input_offset;
    size_t y = ctx->input_idx;
    x[0] += y;
    if (x[0] < y) {
        x[1]++;
    }
}

static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
{
    static const u8 sigma[12][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    };

    // init work vector
    u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
    u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
    u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
    u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
    u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
    u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
    u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
    u64 v7 = ctx->hash[7];  u64 v15 = iv[7];

    // mangle work vector
    u64 *input = ctx->input;
#define BLAKE2_G(a, b, c, d, x, y)      \
    a += b + x;  d = rotr64(d ^ a, 32); \
    c += d;      b = rotr64(b ^ c, 24); \
    a += b + y;  d = rotr64(d ^ a, 16); \
    c += d;      b = rotr64(b ^ c, 63)
#define BLAKE2_ROUND(i)                                                    \
    BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
    BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
    BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
    BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
    BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
    BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
    BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
    BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])

#ifdef BLAKE2_NO_UNROLLING
    FOR (i, 0, 12) {
        BLAKE2_ROUND(i);
    }
#else
    BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
    BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
    BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
#endif

    // update hash
    ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
    ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
    ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
    ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
}

static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
{
    if (index == 0) {
        ZERO(ctx->input, 16);
    }
    size_t word = index >> 3;
    size_t byte = index & 7;
    ctx->input[word] |= (u64)input << (byte << 3);

}

static void blake2b_end_block(crypto_blake2b_ctx *ctx)
{
    if (ctx->input_idx == 128) {  // If buffer is full,
        blake2b_incr(ctx);        // update the input offset
        blake2b_compress(ctx, 0); // and compress the (not last) block
        ctx->input_idx = 0;
    }
}

static void blake2b_update(crypto_blake2b_ctx *ctx,
                           const u8 *message, size_t message_size)
{
    FOR (i, 0, message_size) {
        blake2b_end_block(ctx);
        blake2b_set_input(ctx, message[i], ctx->input_idx);
        ctx->input_idx++;
    }
}

void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
                                 const u8 *key, size_t key_size)
{
    // initial hash
    COPY(ctx->hash, iv, 8);
    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;

    ctx->input_offset[0] = 0;         // beginning of the input, no offset
    ctx->input_offset[1] = 0;         // beginning of the input, no offset
    ctx->hash_size       = hash_size; // remember the hash size we want
    ctx->input_idx       = 0;

    // if there is a key, the first block is that key (padded with zeroes)
    if (key_size > 0) {
        u8 key_block[128] = {0};
        COPY(key_block, key, key_size);
        // same as calling crypto_blake2b_update(ctx, key_block , 128)
        load64_le_buf(ctx->input, key_block, 16);
        ctx->input_idx = 128;
    }
}

void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
{
    crypto_blake2b_general_init(ctx, 64, 0, 0);
}

void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
                           const u8 *message, size_t message_size)
{
    if (message_size == 0) {
        return;
    }
    // Align ourselves with block boundaries
    size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
    blake2b_update(ctx, message, aligned);
    message      += aligned;
    message_size -= aligned;

    // Process the message block by block
    FOR (i, 0, message_size >> 7) { // number of blocks
        blake2b_end_block(ctx);
        load64_le_buf(ctx->input, message, 16);
        message += 128;
        ctx->input_idx = 128;
    }
    message_size &= 127;

    // remaining bytes
    blake2b_update(ctx, message, message_size);
}

void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
{
    // Pad the end of the block with zeroes
    FOR (i, ctx->input_idx, 128) {
        blake2b_set_input(ctx, 0, i);
    }
    blake2b_incr(ctx);        // update the input offset
    blake2b_compress(ctx, 1); // compress the last block
    size_t nb_words = ctx->hash_size >> 3;
    store64_le_buf(hash, ctx->hash, nb_words);
    FOR (i, nb_words << 3, ctx->hash_size) {
        hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
    }
    WIPE_CTX(ctx);
}

void crypto_blake2b_general(u8       *hash   , size_t hash_size,
                            const u8 *key    , size_t key_size,
                            const u8 *message, size_t message_size)
{
    crypto_blake2b_ctx ctx;
    crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
    crypto_blake2b_update(&ctx, message, message_size);
    crypto_blake2b_final(&ctx, hash);
}

void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
{
    crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
}
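// Editor's sketch (not part of the original source): the incremental interface
// produces the same digest as the one-shot crypto_blake2b() above:
//
//     crypto_blake2b_ctx ctx;
//     crypto_blake2b_init  (&ctx);                     // 64-byte digest, no key
//     crypto_blake2b_update(&ctx, part1, part1_size);
//     crypto_blake2b_update(&ctx, part2, part2_size);
//     u8 digest[64];
//     crypto_blake2b_final (&ctx, digest);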

static void blake2b_vtable_init(void *ctx) {
    crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
}
static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
    crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
}
static void blake2b_vtable_final(void *ctx, u8 *h) {
    crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
}
const crypto_sign_vtable crypto_blake2b_vtable = {
    crypto_blake2b,
    blake2b_vtable_init,
    blake2b_vtable_update,
    blake2b_vtable_final,
    sizeof(crypto_sign_ctx),
};

#if MONOCYPHER_ARGON2_ENABLE
////////////////
/// Argon2 i ///
////////////////
// references to R, Z, Q etc. come from the spec

// Argon2 operates on 1024 byte blocks.
typedef struct { u64 a[128]; } block;

static void wipe_block(block *b)
{
    volatile u64* a = b->a;
    ZERO(a, 128);
}

// updates a Blake2 hash with a 32 bit word, little endian.
static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
{
    u8 buf[4];
    store32_le(buf, input);
    crypto_blake2b_update(ctx, buf, 4);
    WIPE_BUFFER(buf);
}

static void load_block(block *b, const u8 bytes[1024])
{
    load64_le_buf(b->a, bytes, 128);
}

static void store_block(u8 bytes[1024], const block *b)
{
    store64_le_buf(bytes, b->a, 128);
}

static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}

// Hash with a virtually unlimited digest size.
// Doesn't extract more entropy than the base hash function.
// Mainly used for filling a whole kilobyte block with pseudo-random bytes.
// (One could use a stream cipher with a seed hash as the key, but
//  this would introduce another dependency —and point of failure.)
static void extended_hash(u8       *digest, u32 digest_size,
                          const u8 *input , u32 input_size)
{
    crypto_blake2b_ctx ctx;
    crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
    blake_update_32            (&ctx, digest_size);
    crypto_blake2b_update      (&ctx, input, input_size);
    crypto_blake2b_final       (&ctx, digest);

    if (digest_size > 64) {
        // the conversion to u64 avoids integer overflow on
        // ludicrously big hash sizes.
        u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
        u32 i   =  1;
        u32 in  =  0;
        u32 out = 32;
        while (i < r) {
            // Input and output overlap. This is intentional
            crypto_blake2b(digest + out, digest + in, 64);
            i   +=  1;
            in  += 32;
            out += 32;
        }
        crypto_blake2b_general(digest + out, digest_size - (32 * r),
                               0, 0, // no key
                               digest + in , 64);
    }
}

#define LSB(x) ((x) & 0xffffffff)
#define G(a, b, c, d)                                          \
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32); \
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24); \
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16); \
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, \
              v8,  v9, v10, v11, v12, v13, v14, v15) \
    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);       \
    G(v2, v6, v10, v14);  G(v3, v7, v11, v15);       \
    G(v0, v5, v10, v15);  G(v1, v6, v11, v12);       \
    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)

// Core of the compression function G. Computes Z from R in place.
static void g_rounds(block *work_block)
{
    // column rounds (work_block = Q)
    for (int i = 0; i < 128; i += 16) {
        ROUND(work_block->a[i     ], work_block->a[i +  1],
              work_block->a[i +  2], work_block->a[i +  3],
              work_block->a[i +  4], work_block->a[i +  5],
              work_block->a[i +  6], work_block->a[i +  7],
              work_block->a[i +  8], work_block->a[i +  9],
              work_block->a[i + 10], work_block->a[i + 11],
              work_block->a[i + 12], work_block->a[i + 13],
              work_block->a[i + 14], work_block->a[i + 15]);
    }
    // row rounds (work_block = Z)
    for (int i = 0; i < 16; i += 2) {
        ROUND(work_block->a[i      ], work_block->a[i +   1],
              work_block->a[i +  16], work_block->a[i +  17],
              work_block->a[i +  32], work_block->a[i +  33],
              work_block->a[i +  48], work_block->a[i +  49],
              work_block->a[i +  64], work_block->a[i +  65],
              work_block->a[i +  80], work_block->a[i +  81],
              work_block->a[i +  96], work_block->a[i +  97],
              work_block->a[i + 112], work_block->a[i + 113]);
    }
}

// The compression function G (copy version for the first pass)
static void g_copy(block *result, const block *x, const block *y, block* tmp)
{
    copy_block(tmp   , x  ); // tmp    = X
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    copy_block(result, tmp); // result = R (only difference with g_xor)
    g_rounds  (tmp);         // tmp    = Z
    xor_block (result, tmp); // result = R ^ Z
}

// The compression function G (xor version for subsequent passes)
static void g_xor(block *result, const block *x, const block *y, block *tmp)
{
    copy_block(tmp   , x  ); // tmp    = X
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
    xor_block (result, tmp); // result = R ^ old (only difference with g_copy)
    g_rounds  (tmp);         // tmp    = Z
    xor_block (result, tmp); // result = R ^ old ^ Z
}

// Unary version of the compression function.
// The missing argument is implied zero.
// Does the transformation in place.
static void unary_g(block *work_block, block *tmp)
{
    // work_block == R
    copy_block(tmp, work_block); // tmp        = R
    g_rounds  (work_block);      // work_block = Z
    xor_block (work_block, tmp); // work_block = Z ^ R
}

// Argon2i uses a kind of stream cipher to determine which reference
// block it will take to synthesise the next block. This context holds
// that stream's state. (It's very similar to Chacha20. The block b
// is analogous to Chacha's own pool)
typedef struct {
    block b;
    u32   pass_number;
    u32   slice_number;
    u32   nb_blocks;
    u32   nb_iterations;
    u32   ctr;
    u32   offset;
} gidx_ctx;

// The block in the context will determine array indices. To avoid
// timing attacks, it only depends on public information. No looking
// at a previous block to seed the next. This makes offline attacks
// easier, but timing attacks are the bigger threat in many settings.
static void gidx_refresh(gidx_ctx *ctx)
{
    // seed the beginning of the block...
    ctx->b.a[0] = ctx->pass_number;
    ctx->b.a[1] = 0;  // lane number (we have only one)
    ctx->b.a[2] = ctx->slice_number;
    ctx->b.a[3] = ctx->nb_blocks;
    ctx->b.a[4] = ctx->nb_iterations;
    ctx->b.a[5] = 1;  // type: Argon2i
    ctx->b.a[6] = ctx->ctr;
    ZERO(ctx->b.a + 7, 121); // ...then zero the rest out

    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
    // (G "square" function), to get cheap pseudo-random numbers.
    block tmp;
    unary_g(&ctx->b, &tmp);
    unary_g(&ctx->b, &tmp);
    wipe_block(&tmp);
}

static void gidx_init(gidx_ctx *ctx,
                      u32 pass_number, u32 slice_number,
                      u32 nb_blocks,   u32 nb_iterations)
{
    ctx->pass_number   = pass_number;
    ctx->slice_number  = slice_number;
    ctx->nb_blocks     = nb_blocks;
    ctx->nb_iterations = nb_iterations;
    ctx->ctr           = 0;

    // Offset from the beginning of the segment. For the first slice
    // of the first pass, we start at the *third* block, so the offset
    // starts at 2, not 0.
    if (pass_number != 0 || slice_number != 0) {
        ctx->offset = 0;
    } else {
        ctx->offset = 2;
        ctx->ctr++;         // Compensates for missed lazy creation
        gidx_refresh(ctx);  // at the start of gidx_next()
    }
}

static u32 gidx_next(gidx_ctx *ctx)
{
    // lazily creates the offset block we need
    if ((ctx->offset & 127) == 0) {
        ctx->ctr++;
        gidx_refresh(ctx);
    }
    u32 index  = ctx->offset & 127; // save index  for current call
    u32 offset = ctx->offset;       // save offset for current call
    ctx->offset++;                  // update offset for next call

    // Computes the area size.
    // Pass 0 : all already finished segments plus already constructed
    //          blocks in this segment
    // Pass 1+: 3 last segments plus already constructed
    //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
    int first_pass  = ctx->pass_number == 0;
    u32 slice_size  = ctx->nb_blocks >> 2;
    u32 nb_segments = first_pass ? ctx->slice_number : 3;
    u32 area_size   = nb_segments * slice_size + offset - 1;

    // Computes the starting position of the reference area.
    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
    // NEXT SEGMENT, NOT THE NEXT BLOCK.
    u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
    u32 start_pos  = first_pass ? 0 : next_slice;

    // Generate offset from J1 (no need for J2, there's only one lane)
    u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
    u64 x   = (j1 * j1)       >> 32;
    u64 y   = (area_size * x) >> 32;
    u64 z   = (area_size - 1) - y;
    u64 ref = start_pos + z;                // ref < 2 * nb_blocks
    return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
}

// Main algorithm
void crypto_argon2i_general(u8       *hash,      u32 hash_size,
                            void     *work_area, u32 nb_blocks,
                            u32       nb_iterations,
                            const u8 *password,  u32 password_size,
                            const u8 *salt,      u32 salt_size,
                            const u8 *key,       u32 key_size,
                            const u8 *ad,        u32 ad_size)
{
    // work area seen as blocks (must be suitably aligned)
    block *blocks = (block*)work_area;
    {
        crypto_blake2b_ctx ctx;
        crypto_blake2b_init(&ctx);

        blake_update_32      (&ctx, 1            ); // p: number of threads
        blake_update_32      (&ctx, hash_size    );
        blake_update_32      (&ctx, nb_blocks    );
        blake_update_32      (&ctx, nb_iterations);
        blake_update_32      (&ctx, 0x13         ); // v: version number
        blake_update_32      (&ctx, 1            ); // y: Argon2i
        blake_update_32      (&ctx, password_size);
        crypto_blake2b_update(&ctx, password, password_size);
        blake_update_32      (&ctx, salt_size);
        crypto_blake2b_update(&ctx, salt, salt_size);
        blake_update_32      (&ctx, key_size);
        crypto_blake2b_update(&ctx, key, key_size);
        blake_update_32      (&ctx, ad_size);
        crypto_blake2b_update(&ctx, ad, ad_size);

        u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
        crypto_blake2b_final(&ctx, initial_hash);

        // fill first 2 blocks
        block tmp_block;
        u8    hash_area[1024];
        store32_le(initial_hash + 64, 0); // first  additional word
        store32_le(initial_hash + 68, 0); // second additional word
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks, &tmp_block);

        store32_le(initial_hash + 64, 1); // slight modification
        extended_hash(hash_area, 1024, initial_hash, 72);
        load_block(&tmp_block, hash_area);
        copy_block(blocks + 1, &tmp_block);

        WIPE_BUFFER(initial_hash);
        WIPE_BUFFER(hash_area);
        wipe_block(&tmp_block);
    }

    // Actual number of blocks
    nb_blocks -= nb_blocks & 3; // round down to 4 p (p == 1 thread)
    const u32 segment_size = nb_blocks >> 2;

    // fill (then re-fill) the rest of the blocks
    block    tmp;
    gidx_ctx ctx; // public information, no need to wipe
    FOR_T (u32, pass_number, 0, nb_iterations) {
        int first_pass = pass_number == 0;

        FOR_T (u32, segment, 0, 4) {
            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);

            // On the first segment of the first pass,
            // blocks 0 and 1 are already filled.
            // We use the offset to skip them.
            u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
            u32 segment_start = segment * segment_size + start_offset;
            u32 segment_end   = (segment + 1) * segment_size;
            FOR_T (u32, current_block, segment_start, segment_end) {
                u32 reference_block = gidx_next(&ctx);
                u32 previous_block  = current_block == 0
                                    ? nb_blocks - 1
                                    : current_block - 1;
                block *c = blocks + current_block;
                block *p = blocks + previous_block;
                block *r = blocks + reference_block;
                if (first_pass) { g_copy(c, p, r, &tmp); }
                else            { g_xor (c, p, r, &tmp); }
            }
        }
    }
    wipe_block(&tmp);
    u8 final_block[1024];
    store_block(final_block, blocks + (nb_blocks - 1));

    // wipe work area
    volatile u64 *p = (u64*)work_area;
    ZERO(p, 128 * nb_blocks);

    // hash the very last block with H' into the output hash
    extended_hash(hash, hash_size, final_block, 1024);
    WIPE_BUFFER(final_block);
}

void crypto_argon2i(u8 *hash, u32 hash_size,
                    void *work_area, u32 nb_blocks, u32 nb_iterations,
                    const u8 *password, u32 password_size,
                    const u8 *salt,     u32 salt_size)
{
    crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
                           password, password_size, salt , salt_size, 0,0,0,0);
}

#endif // MONOCYPHER_ARGON2_ENABLE

////////////////////////////////////
/// Arithmetic modulo 2^255 - 19 ///
////////////////////////////////////
// Originally taken from SUPERCOP's ref10 implementation.
// A bit bigger than TweetNaCl, over 4 times faster.

// field element
typedef i32 fe[10];

// field constants
//
// fe_one      : 1
// sqrtm1      : sqrt(-1)
// d           :     -121665 / 121666
// D2          : 2 * -121665 / 121666
// lop_x, lop_y: low order point in Edwards coordinates
// ufactor     : -sqrt(-1) * 2
// A2          : 486662^2 (A squared)
static const fe fe_one  = {1};
static const fe sqrtm1  = {-32595792, -7943725, 9377950, 3500415, 12389472,
                           -272473, -25146209, -2005654, 326686, 11406482,};
static const fe d       = {-10913610, 13857413, -15372611, 6949391, 114729,
                           -8787816, -6275908, -3247719, -18696448, -12055116,};
static const fe D2      = {-21827239, -5839606, -30745221, 13898782, 229458,
                           15978800, -12551817, -6495438, 29715968, 9444199,};
static const fe lop_x   = {21352778, 5345713, 4660180, -8347857, 24143090,
                           14568123, 30185756, -12247770, -33528939, 8345319,};
static const fe lop_y   = {-6952922, -1265500, 6862341, -7057498, -4037696,
                           -5447722, 31680899, -15325402, -19365852, 1569102,};
static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
                           544946, -16816446, 4011309, -653372, 10741468,};
static const fe A2      = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};

static void fe_0(fe h) {           ZERO(h  , 10); }
static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }

static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}

static void fe_cswap(fe f, fe g, int b)
{
    i32 mask = -b; // -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
        g[i] = g[i] ^ x;
    }
}

static void fe_ccopy(fe f, const fe g, int b)
{
    i32 mask = -b; // -1 = 0xffffffff
    FOR (i, 0, 10) {
        i32 x = (f[i] ^ g[i]) & mask;
        f[i] = f[i] ^ x;
    }
}


// Signed carry propagation
// ------------------------
//
// Let t be a number. It can be uniquely decomposed thus:
//
//    t = h*2^26 + l
//    such that -2^25 <= l < 2^25
//
// Let c = (t + 2^25) / 2^26            (rounded down)
//     c = (h*2^26 + l + 2^25) / 2^26   (rounded down)
//     c = h + (l + 2^25) / 2^26        (rounded down)
//     c = h                            (exactly)
// Because 0 <= l + 2^25 < 2^26
//
// Let u = t - c*2^26
//     u = h*2^26 + l - h*2^26
//     u = l
// Therefore, -2^25 <= u < 2^25
//
// Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
//
// Notations:
// - In C, 1<<25 means 2^25.
// - In C, x>>25 means floor(x / (2^25)).
// - All of the above applies with 25 & 24 as well as 26 & 25.
//
//
// Note on negative right shifts
// -----------------------------
//
// In C, x >> n, where x is a negative integer, is implementation
// defined. In practice, all platforms do arithmetic shift, which is
// equivalent to division by 2^26, rounded down. Some compilers, like
// GCC, even guarantee it.
//
// If we ever stumble upon a platform that does not propagate the sign
// bit (we won't), visible failures will show at the slightest test, and
// the signed shifts can be replaced by the following:
//
//     typedef struct { i64 x:39; } s25;
//     typedef struct { i64 x:38; } s26;
//     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
//     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
//
// Current compilers cannot optimise this, causing a 30% drop in
// performance. Fairly expensive for something that never happens.
//
//
// Precondition
// ------------
//
// |t0|       < 2^63
// |t1|..|t9| < 2^62
//
// Algorithm
// ---------
// c   = t0 + 2^25 / 2^26   -- |c|  <= 2^36
// t0 -= c * 2^26           -- |t0| <= 2^25
// t1 += c                  -- |t1| <= 2^63
//
// c   = t4 + 2^25 / 2^26   -- |c|  <= 2^36
// t4 -= c * 2^26           -- |t4| <= 2^25
// t5 += c                  -- |t5| <= 2^63
//
// c   = t1 + 2^24 / 2^25   -- |c|  <= 2^38
// t1 -= c * 2^25           -- |t1| <= 2^24
// t2 += c                  -- |t2| <= 2^63
//
// c   = t5 + 2^24 / 2^25   -- |c|  <= 2^38
// t5 -= c * 2^25           -- |t5| <= 2^24
// t6 += c                  -- |t6| <= 2^63
//
// c   = t2 + 2^25 / 2^26   -- |c|  <= 2^37
// t2 -= c * 2^26           -- |t2| <= 2^25  < 1.1 * 2^25  (final t2)
// t3 += c                  -- |t3| <= 2^63
//
// c   = t6 + 2^25 / 2^26   -- |c|  <= 2^37
// t6 -= c * 2^26           -- |t6| <= 2^25  < 1.1 * 2^25  (final t6)
// t7 += c                  -- |t7| <= 2^63
//
// c   = t3 + 2^24 / 2^25   -- |c|  <= 2^38
// t3 -= c * 2^25           -- |t3| <= 2^24  < 1.1 * 2^24  (final t3)
// t4 += c                  -- |t4| <= 2^25 + 2^38  < 2^39
//
// c   = t7 + 2^24 / 2^25   -- |c|  <= 2^38
// t7 -= c * 2^25           -- |t7| <= 2^24  < 1.1 * 2^24  (final t7)
// t8 += c                  -- |t8| <= 2^63
//
// c   = t4 + 2^25 / 2^26   -- |c|  <= 2^13
// t4 -= c * 2^26           -- |t4| <= 2^25  < 1.1 * 2^25  (final t4)
// t5 += c                  -- |t5| <= 2^24 + 2^13  < 1.1 * 2^24  (final t5)
//
// c   = t8 + 2^25 / 2^26   -- |c|  <= 2^37
// t8 -= c * 2^26           -- |t8| <= 2^25  < 1.1 * 2^25  (final t8)
// t9 += c                  -- |t9| <= 2^63
//
// c   = t9 + 2^24 / 2^25   -- |c|  <= 2^38
// t9 -= c * 2^25           -- |t9| <= 2^24  < 1.1 * 2^24  (final t9)
// t0 += c * 19             -- |t0| <= 2^25 + 2^38*19  < 2^44
//
// c   = t0 + 2^25 / 2^26   -- |c|  <= 2^18
// t0 -= c * 2^26           -- |t0| <= 2^25  < 1.1 * 2^25  (final t0)
// t1 += c                  -- |t1| <= 2^24 + 2^18  < 1.1 * 2^24  (final t1)
//
// Postcondition
// -------------
//   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
//   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
#define FE_CARRY                                                        \
    i64 c;                                                              \
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
    c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
    c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
    c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
    c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
    c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
    c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
    c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
    c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
    h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
    h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9

static void fe_frombytes(fe h, const u8 s[32])
{
    i64 t0 =  load32_le(s);                        // t0 < 2^32
    i64 t1 =  load24_le(s +  4) << 6;              // t1 < 2^30
    i64 t2 =  load24_le(s +  7) << 5;              // t2 < 2^29
    i64 t3 =  load24_le(s + 10) << 3;              // t3 < 2^27
    i64 t4 =  load24_le(s + 13) << 2;              // t4 < 2^26
    i64 t5 =  load32_le(s + 16);                   // t5 < 2^32
    i64 t6 =  load24_le(s + 20) << 7;              // t6 < 2^31
    i64 t7 =  load24_le(s + 23) << 5;              // t7 < 2^29
    i64 t8 =  load24_le(s + 26) << 4;              // t8 < 2^28
    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;  // t9 < 2^25
    FE_CARRY;                                      // Carry precondition OK
}

// Precondition
//   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
//   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
//
// Therefore, |h| < 2^255-19
// There are two possibilities:
//
// - If h is positive, all we need to do is reduce its individual
//   limbs down to their tight positive range.
// - If h is negative, we also need to add 2^255-19 to it.
//   Or just remove 19 and chop off any excess bit.
static void fe_tobytes(u8 s[32], const fe h)
{
    i32 t[10];
    COPY(t, h, 10);
    i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
    //                 |t9|                    < 1.1 * 2^24
    //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
    //  -21 * 2^24   <  19 * t9                < 21 * 2^24
    //  -2^29        <  19 * t9 + 2^24         < 2^29
    //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
    //  -16          < (19 * t9 + 2^24) / 2^25 < 16
    FOR (i, 0, 5) {
        q += t[2*i  ];  q >>= 26; // q = 0 or -1
        q += t[2*i+1];  q >>= 25; // q = 0 or -1
    }
    // q =  0 iff h >= 0
    // q = -1 iff h <  0
    // Adding q * 19 to h reduces h to its proper range.
    q *= 19;  // Shift carry back to the beginning
    FOR (i, 0, 5) {
        t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
        t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
    }
    // h is now fully reduced, and q represents the excess bit.

    store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
    store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
    store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
    store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
    store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
    store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
    store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
    store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));

    WIPE_BUFFER(t);
}

// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
//
//   |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
//   |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
static void fe_mul_small(fe h, const fe f, i32 g)
{
    i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
    i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
    i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
    i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
    i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
    // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58
    // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57

    FE_CARRY; // Carry precondition OK
}

// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
//
//   |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
//   |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
static void fe_mul(fe h, const fe f, const fe g)
{
    // Everything is unrolled and put in temporary variables.
    // We could roll the loop, but that would make curve25519 twice as slow.
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
    i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
    i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
    i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
    i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
    i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
    // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26
    // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31
    // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30

    i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
           + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
    i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
           + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
    i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
           + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
    i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
           + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
    i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
           + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
    i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
           + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
    i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
           + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
    i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
           + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
    i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
           + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
    i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
           + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
    // t0 < 0.67 * 2^61
    // t1 < 0.41 * 2^61
    // t2 < 0.52 * 2^61
    // t3 < 0.32 * 2^61
    // t4 < 0.38 * 2^61
    // t5 < 0.22 * 2^61
    // t6 < 0.23 * 2^61
    // t7 < 0.13 * 2^61
    // t8 < 0.09 * 2^61
    // t9 < 0.03 * 2^61

    FE_CARRY; // Everything below 2^62, Carry precondition OK
}

// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
//
// Note: we could use fe_mul() for this, but this is significantly faster
static void fe_sq(fe h, const fe f)
{
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
    i32 f0_2  = f0*2;  i32 f1_2  = f1*2;  i32 f2_2  = f2*2;  i32 f3_2 = f3*2;
    i32 f4_2  = f4*2;  i32 f5_2  = f5*2;  i32 f6_2  = f6*2;  i32 f7_2 = f7*2;
    i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
    i32 f8_19 = f8*19; i32 f9_38 = f9*38;
    // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  < 1.65 * 2^27
    // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  < 1.65 * 2^26
    // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31

    i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
           + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
    i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
           + f4  *(i64)f7_38 + f5_2*(i64)f6_19;
    i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
           + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
    i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
           + f5_2*(i64)f8_19 + f6  *(i64)f7_38;
    i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
           + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
    i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
           + f6  *(i64)f9_38 + f7_2*(i64)f8_19;
    i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
           + f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
    i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
           + f3_2*(i64)f4    + f8  *(i64)f9_38;
    i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
           + f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
    i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
           + f3_2*(i64)f6    + f4  *(i64)f5_2;
    // t0 < 0.67 * 2^61
    // t1 < 0.41 * 2^61
    // t2 < 0.52 * 2^61
    // t3 < 0.32 * 2^61
    // t4 < 0.38 * 2^61
    // t5 < 0.22 * 2^61
    // t6 < 0.23 * 2^61
    // t7 < 0.13 * 2^61
    // t8 < 0.09 * 2^61
    // t9 < 0.03 * 2^61

    FE_CARRY;
}

// h = 2 * (f^2)
//
// Precondition
// -------------
//   |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
//   |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
//
// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
// each limb by 2, *then* perform the carry. This saves one carry.
// However, doing so with the stated preconditions does not work (t2
// would overflow). There are 3 ways to solve this:
//
// 1. Show that t2 actually never overflows (it really does not).
// 2. Accept an additional carry, at a small loss of performance.
// 3. Make sure the input of fe_sq2() is freshly carried.
//
// SUPERCOP ref10 relies on (1).
// Monocypher chose (2) and (3), mostly to save code.
static void fe_sq2(fe h, const fe f)
{
    fe_sq(h, f);
    fe_mul_small(h, h, 2);
}

// This could be simplified, but it would be slower
static void fe_pow22523(fe out, const fe z)
{
    fe t0, t1, t2;
    fe_sq(t0, z);
    fe_sq(t1, t0);  fe_sq(t1, t1);  fe_mul(t1, z, t1);
    fe_mul(t0, t0, t1);
    fe_sq(t0, t0);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
    fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
    fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
    fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
    WIPE_BUFFER(t0);
    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
}

// Inverting means multiplying by 2^255 - 21
// 2^255 - 21 = (2^252 - 3) * 8 + 3
// So we reuse the multiplication chain of fe_pow22523
static void fe_invert(fe out, const fe z)
{
    fe tmp;
    fe_pow22523(tmp, z);
    // tmp2^8 * z^3
    fe_sq(tmp, tmp);                        // 0
    fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
    fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
    WIPE_BUFFER(tmp);
}

// Parity check. Returns 0 if even, 1 if odd
static int fe_isodd(const fe f)
{
    u8 s[32];
    fe_tobytes(s, f);
    u8 isodd = s[0] & 1;
    WIPE_BUFFER(s);
    return isodd;
}

// Returns 1 if equal, 0 if not equal
static int fe_isequal(const fe f, const fe g)
{
    u8 fs[32];
    u8 gs[32];
    fe_tobytes(fs, f);
    fe_tobytes(gs, g);
    int isdifferent = crypto_verify32(fs, gs);
    WIPE_BUFFER(fs);
    WIPE_BUFFER(gs);
    return 1 + isdifferent;
}

// Inverse square root.
// Returns true if x is a non zero square, false otherwise.
// After the call:
//   isr = sqrt(1/x)        if x is a non-zero square.
//   isr = sqrt(sqrt(-1)/x) if x is not a square.
//   isr = 0                if x is zero.
// We do not guarantee the sign of the square root.
//
// Notes:
// Let quartic = x^((p-1)/4)
//
// x^((p-1)/2) = chi(x)
// quartic^2   = chi(x)
// quartic     = sqrt(chi(x))
// quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
//
// Note that x is a square if quartic is 1 or -1
// There are 4 cases to consider:
//
// if   quartic         =  1  (x is a square)
// then x^((p-1)/4)     =  1
//      x^((p-5)/4) * x =  1
//      x^((p-5)/4)     =  1/x
//      x^((p-5)/8)     =  sqrt(1/x) or -sqrt(1/x)
//
// if   quartic                = -1  (x is a square)
// then x^((p-1)/4)            = -1
//      x^((p-5)/4) * x        = -1
//      x^((p-5)/4)            = -1/x
//      x^((p-5)/8)            =  sqrt(-1) / sqrt(x)
//      x^((p-5)/8) * sqrt(-1) =  sqrt(-1)^2 / sqrt(x)
//      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
//      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
//
// if   quartic         =  sqrt(-1)  (x is not a square)
// then x^((p-1)/4)     =  sqrt(-1)
//      x^((p-5)/4) * x =  sqrt(-1)
//      x^((p-5)/4)     =  sqrt(-1)/x
//      x^((p-5)/8)     =  sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
//
// Note that the product of two non-squares is always a square:
//   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
//   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
//   Therefore a*b is a square.
//
//   Since sqrt(-1) and x are both non-squares, their product is a
//   square, and we can compute their square root.
//
// if   quartic                = -sqrt(-1)  (x is not a square)
// then x^((p-1)/4)            = -sqrt(-1)
//      x^((p-5)/4) * x        = -sqrt(-1)
//      x^((p-5)/4)            = -sqrt(-1)/x
//      x^((p-5)/8)            =  sqrt(-sqrt(-1)/x)
//      x^((p-5)/8)            =  sqrt( sqrt(-1)/x) * sqrt(-1)
//      x^((p-5)/8) * sqrt(-1) =  sqrt( sqrt(-1)/x) * sqrt(-1)^2
//      x^((p-5)/8) * sqrt(-1) =  sqrt( sqrt(-1)/x) * -1
//      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
static int invsqrt(fe isr, const fe x)
{
    fe check, quartic;
    fe_copy(check, x);
    fe_pow22523(isr, check);
    fe_sq (quartic, isr);
    fe_mul(quartic, quartic, check);
    fe_1  (check);          int p1 = fe_isequal(quartic, check);
    fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
    fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
    fe_mul(check, isr, sqrtm1);
    fe_ccopy(isr, check, m1 | ms);
    WIPE_BUFFER(quartic);
    WIPE_BUFFER(check);
    return p1 | m1;
}
1571
1572
// trim a scalar for scalar multiplication
1573
static void trim_scalar(u8 scalar[32])
1574
{
1575
scalar[ 0] &= 248;
1576
scalar[31] &= 127;
1577
scalar[31] |= 64;
1578
}
1579
1580
// get bit from scalar at position i
1581
static int scalar_bit(const u8 s[32], int i)
1582
{
1583
if (i < 0) { return 0; } // handle -1 for sliding windows
1584
return (s[i>>3] >> (i&7)) & 1;
1585
}
1586
1587
///////////////
1588
/// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1589
///////////////
1590
static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
1591
int nb_bits)
1592
{
1593
// computes the scalar product
1594
fe x1;
1595
fe_frombytes(x1, p);
1596
1597
// computes the actual scalar product (the result is in x2 and z2)
1598
fe x2, z2, x3, z3, t0, t1;
1599
// Montgomery ladder
1600
// In projective coordinates, to avoid divisions: x = X / Z
1601
// We don't care about the y coordinate, it's only 1 bit of information
1602
fe_1(x2); fe_0(z2); // "zero" point
1603
fe_copy(x3, x1); fe_1(z3); // "one" point
1604
int swap = 0;
1605
for (int pos = nb_bits-1; pos >= 0; --pos) {
1606
// constant time conditional swap before ladder step
1607
int b = scalar_bit(scalar, pos);
1608
swap ^= b; // xor trick avoids swapping at the end of the loop
1609
fe_cswap(x2, x3, swap);
1610
fe_cswap(z2, z3, swap);
1611
swap = b; // anticipates one last swap after the loop
1612
1613
// Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1614
// with differential addition
1615
fe_sub(t0, x3, z3);
1616
fe_sub(t1, x2, z2);
1617
fe_add(x2, x2, z2);
1618
fe_add(z2, x3, z3);
1619
fe_mul(z3, t0, x2);
1620
fe_mul(z2, z2, t1);
1621
fe_sq (t0, t1 );
1622
fe_sq (t1, x2 );
1623
fe_add(x3, z3, z2);
1624
fe_sub(z2, z3, z2);
1625
fe_mul(x2, t1, t0);
1626
fe_sub(t1, t1, t0);
1627
fe_sq (z2, z2 );
1628
fe_mul_small(z3, t1, 121666);
1629
fe_sq (x3, x3 );
1630
fe_add(t0, t0, z3);
1631
fe_mul(z3, x1, z2);
1632
fe_mul(z2, t1, t0);
1633
}
1634
// last swap is necessary to compensate for the xor trick
1635
// Note: after this swap, P3 == P2 + P1.
1636
fe_cswap(x2, x3, swap);
1637
fe_cswap(z2, z3, swap);
1638
1639
// normalises the coordinates: x == X / Z
1640
fe_invert(z2, z2);
1641
fe_mul(x2, x2, z2);
1642
fe_tobytes(q, x2);
1643
1644
WIPE_BUFFER(x1);
1645
WIPE_BUFFER(x2); WIPE_BUFFER(z2); WIPE_BUFFER(t0);
1646
WIPE_BUFFER(x3); WIPE_BUFFER(z3); WIPE_BUFFER(t1);
1647
}
1648
1649
void crypto_x25519(u8 raw_shared_secret[32],
1650
const u8 your_secret_key [32],
1651
const u8 their_public_key [32])
1652
{
1653
// restrict the possible scalar values
1654
u8 e[32];
1655
COPY(e, your_secret_key, 32);
1656
trim_scalar(e);
1657
scalarmult(raw_shared_secret, e, their_public_key, 255);
1658
WIPE_BUFFER(e);
1659
}
1660
1661
void crypto_x25519_public_key(u8 public_key[32],
1662
const u8 secret_key[32])
1663
{
1664
static const u8 base_point[32] = {9};
1665
crypto_x25519(public_key, secret_key, base_point);
1666
}
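// Illustrative usage sketch (not part of the library): a raw X25519
// key exchange with the two functions above. The variable names
// (alice_sk, bob_pk, ...) are made up for this example, and the secret
// keys are assumed to be 32 bytes from a CSPRNG.
//
//     uint8_t alice_sk[32], alice_pk[32];   // alice_sk: random
//     uint8_t bob_sk  [32], bob_pk  [32];   // bob_sk:   random
//     crypto_x25519_public_key(alice_pk, alice_sk);
//     crypto_x25519_public_key(bob_pk  , bob_sk  );
//
//     uint8_t shared_a[32], shared_b[32];
//     crypto_x25519(shared_a, alice_sk, bob_pk  );  // Alice's side
//     crypto_x25519(shared_b, bob_sk  , alice_pk);  // Bob's side
//     // shared_a == shared_b. Hash it before using it as a symmetric
//     // key (crypto_key_exchange() below does that with HChaCha20).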
1667
1668
///////////////////////////
1669
/// Arithmetic modulo L ///
1670
///////////////////////////
1671
static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1672
0x00000000, 0x00000000, 0x00000000, 0x10000000,};
1673
1674
// p = a*b + p
1675
static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1676
{
1677
FOR (i, 0, 8) {
1678
u64 carry = 0;
1679
FOR (j, 0, 8) {
1680
carry += p[i+j] + (u64)a[i] * b[j];
1681
p[i+j] = (u32)carry;
1682
carry >>= 32;
1683
}
1684
p[i+8] = (u32)carry;
1685
}
1686
}
1687
1688
static int is_above_l(const u32 x[8])
1689
{
1690
// We work with L directly, in a 2's complement encoding
1691
// (-L == ~L + 1)
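// Informal note: with carry starting at 1, the loop below computes
// x + ~L + 1 = x - L limb by limb. The final carry out is 1 exactly
// when x >= L, and 0 when the subtraction would borrow (x < L).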
1692
u64 carry = 1;
1693
FOR (i, 0, 8) {
1694
carry += (u64)x[i] + ~L[i];
1695
carry >>= 32;
1696
}
1697
return carry;
1698
}
1699
1700
// Final reduction modulo L, by conditionally removing L.
1701
// if x < l , then r = x
1702
// if l <= x < 2*l, then r = x-l
1703
// otherwise the result will be wrong
1704
static void remove_l(u32 r[8], const u32 x[8])
1705
{
1706
u64 carry = is_above_l(x);
1707
u32 mask = ~(u32)carry + 1; // carry == 0 or 1
1708
FOR (i, 0, 8) {
1709
carry += (u64)x[i] + (~L[i] & mask);
1710
r[i] = (u32)carry;
1711
carry >>= 32;
1712
}
1713
}
1714
1715
// Full reduction modulo L (Barrett reduction)
1716
static void mod_l(u8 reduced[32], const u32 x[16])
1717
{
1718
static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1719
0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
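// Informal note: r above is the Barrett constant floor(2^512 / L).
// The estimate q = floor(x*r / 2^512) never exceeds the true quotient
// floor(x / L), and for the inputs used here x - q*L stays below 2*L,
// so the single conditional subtraction in remove_l() below completes
// the reduction.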
1720
// xr = x * r
1721
u32 xr[25] = {0};
1722
FOR (i, 0, 9) {
1723
u64 carry = 0;
1724
FOR (j, 0, 16) {
1725
carry += xr[i+j] + (u64)r[i] * x[j];
1726
xr[i+j] = (u32)carry;
1727
carry >>= 32;
1728
}
1729
xr[i+16] = (u32)carry;
1730
}
1731
// xr = floor(xr / 2^512) * L
1732
// Since the result is guaranteed to be below 2*L,
1733
// it is enough to only compute the first 256 bits.
1734
// The division by 2^512 is implicit: we simply read xr from limb 16 up
// (16 * 32 = 512)
1735
ZERO(xr, 8);
1736
FOR (i, 0, 8) {
1737
u64 carry = 0;
1738
FOR (j, 0, 8-i) {
1739
carry += xr[i+j] + (u64)xr[i+16] * L[j];
1740
xr[i+j] = (u32)carry;
1741
carry >>= 32;
1742
}
1743
}
1744
// xr = x - xr
1745
u64 carry = 1;
1746
FOR (i, 0, 8) {
1747
carry += (u64)x[i] + ~xr[i];
1748
xr[i] = (u32)carry;
1749
carry >>= 32;
1750
}
1751
// Final reduction modulo L (conditional subtraction)
1752
remove_l(xr, xr);
1753
store32_le_buf(reduced, xr, 8);
1754
1755
WIPE_BUFFER(xr);
1756
}
1757
1758
static void reduce(u8 r[64])
1759
{
1760
u32 x[16];
1761
load32_le_buf(x, r, 16);
1762
mod_l(r, x);
1763
WIPE_BUFFER(x);
1764
}
1765
1766
// r = (a * b) + c
1767
static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
1768
{
1769
u32 A[8]; load32_le_buf(A, a, 8);
1770
u32 B[8]; load32_le_buf(B, b, 8);
1771
u32 p[16];
1772
load32_le_buf(p, c, 8);
1773
ZERO(p + 8, 8);
1774
multiply(p, A, B);
1775
mod_l(r, p);
1776
WIPE_BUFFER(p);
1777
WIPE_BUFFER(A);
1778
WIPE_BUFFER(B);
1779
}
1780
1781
///////////////
1782
/// Ed25519 ///
1783
///////////////
1784
1785
// Point (group element, ge) in a twisted Edwards curve,
1786
// in extended projective coordinates.
1787
// ge : x = X/Z, y = Y/Z, T = XY/Z
1788
// ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1789
// ge_precomp: Z = 1
1790
typedef struct { fe X; fe Y; fe Z; fe T; } ge;
1791
typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1792
typedef struct { fe Yp; fe Ym; fe T2; } ge_precomp;
1793
1794
static void ge_zero(ge *p)
1795
{
1796
fe_0(p->X);
1797
fe_1(p->Y);
1798
fe_1(p->Z);
1799
fe_0(p->T);
1800
}
1801
1802
static void ge_tobytes(u8 s[32], const ge *h)
1803
{
1804
fe recip, x, y;
1805
fe_invert(recip, h->Z);
1806
fe_mul(x, h->X, recip);
1807
fe_mul(y, h->Y, recip);
1808
fe_tobytes(s, y);
1809
s[31] ^= fe_isodd(x) << 7;
1810
1811
WIPE_BUFFER(recip);
1812
WIPE_BUFFER(x);
1813
WIPE_BUFFER(y);
1814
}
1815
1816
// h = s, where s is a point encoded in 32 bytes
1817
//
1818
// Variable time! Inputs must not be secret!
1819
// => Use only to *check* signatures.
1820
//
1821
// From the specifications:
1822
// The encoding of s contains y and the sign of x
1823
// x = sqrt((y^2 - 1) / (d*y^2 + 1))
1824
// In extended coordinates:
1825
// X = x, Y = y, Z = 1, T = x*y
1826
//
1827
// Note that num * den is a square iff num / den is a square
1828
// If num * den is not a square, the point was not on the curve.
1829
// From the above:
1830
// Let num = y^2 - 1
1831
// Let den = d*y^2 + 1
1832
// x = sqrt((y^2 - 1) / (d*y^2 + 1))
1833
// x = sqrt(num / den)
1834
// x = sqrt(num^2 / (num * den))
1835
// x = num * sqrt(1 / (num * den))
1836
//
1837
// Therefore, we can just compute:
1838
// num = y^2 - 1
1839
// den = d*y^2 + 1
1840
// isr = invsqrt(num * den) // abort if not square
1841
// x = num * isr
1842
// Finally, negate x if its sign is not as specified.
1843
static int ge_frombytes_vartime(ge *h, const u8 s[32])
1844
{
1845
fe_frombytes(h->Y, s);
1846
fe_1(h->Z);
1847
fe_sq (h->T, h->Y); // t = y^2
1848
fe_mul(h->X, h->T, d ); // x = d*y^2
1849
fe_sub(h->T, h->T, h->Z); // t = y^2 - 1
1850
fe_add(h->X, h->X, h->Z); // x = d*y^2 + 1
1851
fe_mul(h->X, h->T, h->X); // x = (y^2 - 1) * (d*y^2 + 1)
1852
int is_square = invsqrt(h->X, h->X);
1853
if (!is_square) {
1854
return -1; // Not on the curve, abort
1855
}
1856
fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1857
if (fe_isodd(h->X) != (s[31] >> 7)) {
1858
fe_neg(h->X, h->X);
1859
}
1860
fe_mul(h->T, h->X, h->Y);
1861
return 0;
1862
}
1863
1864
static void ge_cache(ge_cached *c, const ge *p)
1865
{
1866
fe_add (c->Yp, p->Y, p->X);
1867
fe_sub (c->Ym, p->Y, p->X);
1868
fe_copy(c->Z , p->Z );
1869
fe_mul (c->T2, p->T, D2 );
1870
}
1871
1872
// Internal buffers are not wiped! Inputs must not be secret!
1873
// => Use only to *check* signatures.
1874
static void ge_add(ge *s, const ge *p, const ge_cached *q)
1875
{
1876
fe a, b;
1877
fe_add(a , p->Y, p->X );
1878
fe_sub(b , p->Y, p->X );
1879
fe_mul(a , a , q->Yp);
1880
fe_mul(b , b , q->Ym);
1881
fe_add(s->Y, a , b );
1882
fe_sub(s->X, a , b );
1883
1884
fe_add(s->Z, p->Z, p->Z );
1885
fe_mul(s->Z, s->Z, q->Z );
1886
fe_mul(s->T, p->T, q->T2);
1887
fe_add(a , s->Z, s->T );
1888
fe_sub(b , s->Z, s->T );
1889
1890
fe_mul(s->T, s->X, s->Y);
1891
fe_mul(s->X, s->X, b );
1892
fe_mul(s->Y, s->Y, a );
1893
fe_mul(s->Z, a , b );
1894
}
1895
1896
// Internal buffers are not wiped! Inputs must not be secret!
1897
// => Use only to *check* signatures.
1898
static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1899
{
1900
ge_cached neg;
1901
fe_copy(neg.Ym, q->Yp);
1902
fe_copy(neg.Yp, q->Ym);
1903
fe_copy(neg.Z , q->Z );
1904
fe_neg (neg.T2, q->T2);
1905
ge_add(s, p, &neg);
1906
}
1907
1908
static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1909
{
1910
fe_add(a , p->Y, p->X );
1911
fe_sub(b , p->Y, p->X );
1912
fe_mul(a , a , q->Yp);
1913
fe_mul(b , b , q->Ym);
1914
fe_add(s->Y, a , b );
1915
fe_sub(s->X, a , b );
1916
1917
fe_add(s->Z, p->Z, p->Z );
1918
fe_mul(s->T, p->T, q->T2);
1919
fe_add(a , s->Z, s->T );
1920
fe_sub(b , s->Z, s->T );
1921
1922
fe_mul(s->T, s->X, s->Y);
1923
fe_mul(s->X, s->X, b );
1924
fe_mul(s->Y, s->Y, a );
1925
fe_mul(s->Z, a , b );
1926
}
1927
1928
static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1929
{
1930
fe_add(a , p->Y, p->X );
1931
fe_sub(b , p->Y, p->X );
1932
fe_mul(a , a , q->Ym);
1933
fe_mul(b , b , q->Yp);
1934
fe_add(s->Y, a , b );
1935
fe_sub(s->X, a , b );
1936
1937
fe_add(s->Z, p->Z, p->Z );
1938
fe_mul(s->T, p->T, q->T2);
1939
fe_sub(a , s->Z, s->T );
1940
fe_add(b , s->Z, s->T );
1941
1942
fe_mul(s->T, s->X, s->Y);
1943
fe_mul(s->X, s->X, b );
1944
fe_mul(s->Y, s->Y, a );
1945
fe_mul(s->Z, a , b );
1946
}
1947
1948
static void ge_double(ge *s, const ge *p, ge *q)
1949
{
1950
fe_sq (q->X, p->X);
1951
fe_sq (q->Y, p->Y);
1952
fe_sq2(q->Z, p->Z);
1953
fe_add(q->T, p->X, p->Y);
1954
fe_sq (s->T, q->T);
1955
fe_add(q->T, q->Y, q->X);
1956
fe_sub(q->Y, q->Y, q->X);
1957
fe_sub(q->X, s->T, q->T);
1958
fe_sub(q->Z, q->Z, q->Y);
1959
1960
fe_mul(s->X, q->X , q->Z);
1961
fe_mul(s->Y, q->T , q->Y);
1962
fe_mul(s->Z, q->Y , q->Z);
1963
fe_mul(s->T, q->X , q->T);
1964
}
1965
1966
// 5-bit signed window in cached format (Niels coordinates, Z=1)
1967
static const ge_precomp b_window[8] = {
1968
{{25967493,-14356035,29566456,3660896,-12694345,
1969
4014787,27544626,-11754271,-6079156,2047605,},
1970
{-12545711,934262,-2722910,3049990,-727428,
1971
9406986,12720692,5043384,19500929,-15469378,},
1972
{-8738181,4489570,9688441,-14785194,10184609,
1973
-12363380,29287919,11864899,-24514362,-4438546,},},
1974
{{15636291,-9688557,24204773,-7912398,616977,
1975
-16685262,27787600,-14772189,28944400,-1550024,},
1976
{16568933,4717097,-11556148,-1102322,15682896,
1977
-11807043,16354577,-11775962,7689662,11199574,},
1978
{30464156,-5976125,-11779434,-15670865,23220365,
1979
15915852,7512774,10017326,-17749093,-9920357,},},
1980
{{10861363,11473154,27284546,1981175,-30064349,
1981
12577861,32867885,14515107,-15438304,10819380,},
1982
{4708026,6336745,20377586,9066809,-11272109,
1983
6594696,-25653668,12483688,-12668491,5581306,},
1984
{19563160,16186464,-29386857,4097519,10237984,
1985
-4348115,28542350,13850243,-23678021,-15815942,},},
1986
{{5153746,9909285,1723747,-2777874,30523605,
1987
5516873,19480852,5230134,-23952439,-15175766,},
1988
{-30269007,-3463509,7665486,10083793,28475525,
1989
1649722,20654025,16520125,30598449,7715701,},
1990
{28881845,14381568,9657904,3680757,-20181635,
1991
7843316,-31400660,1370708,29794553,-1409300,},},
1992
{{-22518993,-6692182,14201702,-8745502,-23510406,
1993
8844726,18474211,-1361450,-13062696,13821877,},
1994
{-6455177,-7839871,3374702,-4740862,-27098617,
1995
-10571707,31655028,-7212327,18853322,-14220951,},
1996
{4566830,-12963868,-28974889,-12240689,-7602672,
1997
-2830569,-8514358,-10431137,2207753,-3209784,},},
1998
{{-25154831,-4185821,29681144,7868801,-6854661,
1999
-9423865,-12437364,-663000,-31111463,-16132436,},
2000
{25576264,-2703214,7349804,-11814844,16472782,
2001
9300885,3844789,15725684,171356,6466918,},
2002
{23103977,13316479,9739013,-16149481,817875,
2003
-15038942,8965339,-14088058,-30714912,16193877,},},
2004
{{-33521811,3180713,-2394130,14003687,-16903474,
2005
-16270840,17238398,4729455,-18074513,9256800,},
2006
{-25182317,-4174131,32336398,5036987,-21236817,
2007
11360617,22616405,9761698,-19827198,630305,},
2008
{-13720693,2639453,-24237460,-7406481,9494427,
2009
-5774029,-6554551,-15960994,-2449256,-14291300,},},
2010
{{-3151181,-5046075,9282714,6866145,-31907062,
2011
-863023,-18940575,15033784,25105118,-7894876,},
2012
{-24326370,15950226,-31801215,-14592823,-11662737,
2013
-5090925,1573892,-2625887,2198790,-15804619,},
2014
{-3099351,10324967,-2241613,7453183,-5446979,
2015
-2735503,-13812022,-16236442,-32461234,-12290683,},},
2016
};
2017
2018
// Incremental sliding windows (left to right)
2019
// Based on Roberto Maria Avanzi[2005]
2020
typedef struct {
2021
i16 next_index; // position of the next signed digit
2022
i8 next_digit; // next signed digit (odd number below 2^window_width)
2023
u8 next_check; // point at which we must check for a new window
2024
} slide_ctx;
2025
2026
static void slide_init(slide_ctx *ctx, const u8 scalar[32])
2027
{
2028
// scalar is guaranteed to be below L, either because we checked (s),
2029
// or because we reduced it modulo L (h_ram). L is under 2^253, so
2030
// bits 253 to 255 are guaranteed to be zero. No need to test them.
2031
//
2032
// Note however that L is very close to 2^252, so bit 252 is almost
2033
// always zero. If we were to start at bit 251, the tests wouldn't
2034
// catch the off-by-one error (constructing one that does would be
2035
// prohibitively expensive).
2036
//
2037
// We should still check bit 252, though.
2038
int i = 252;
2039
while (i > 0 && scalar_bit(scalar, i) == 0) {
2040
i--;
2041
}
2042
ctx->next_check = (u8)(i + 1);
2043
ctx->next_index = -1;
2044
ctx->next_digit = -1;
2045
}
2046
2047
static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
2048
{
2049
if (i == ctx->next_check) {
2050
if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
2051
ctx->next_check--;
2052
} else {
2053
// compute digit of next window
2054
int w = MIN(width, i + 1);
2055
int v = -(scalar_bit(scalar, i) << (w-1));
2056
FOR_T (int, j, 0, w-1) {
2057
v += scalar_bit(scalar, i-(w-1)+j) << j;
2058
}
2059
v += scalar_bit(scalar, i-w);
2060
int lsb = v & (~v + 1); // smallest bit of v
2061
int s = ( ((lsb & 0xAA) != 0) // log2(lsb)
2062
| (((lsb & 0xCC) != 0) << 1)
2063
| (((lsb & 0xF0) != 0) << 2));
2064
ctx->next_index = (i16)(i-(w-1)+s);
2065
ctx->next_digit = (i8) (v >> s );
2066
ctx->next_check -= (u8) w;
2067
}
2068
}
2069
return i == ctx->next_index ? ctx->next_digit: 0;
2070
}
2071
2072
#define P_W_WIDTH 3 // Affects the size of the stack
2073
#define B_W_WIDTH 5 // Affects the size of the binary
2074
#define P_W_SIZE (1<<(P_W_WIDTH-2))
2075
2076
// P = [b]B + [p]P, where B is the base point
2077
//
2078
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
2079
// => Use only to *check* signatures.
2080
static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
2081
{
2082
// cache P window for addition
2083
ge_cached cP[P_W_SIZE];
2084
{
2085
ge P2, tmp;
2086
ge_double(&P2, P, &tmp);
2087
ge_cache(&cP[0], P);
2088
FOR (i, 1, P_W_SIZE) {
2089
ge_add(&tmp, &P2, &cP[i-1]);
2090
ge_cache(&cP[i], &tmp);
2091
}
2092
}
2093
2094
// Merged double and add ladder, fused with sliding
2095
slide_ctx p_slide; slide_init(&p_slide, p);
2096
slide_ctx b_slide; slide_init(&b_slide, b);
2097
int i = MAX(p_slide.next_check, b_slide.next_check);
2098
ge *sum = P;
2099
ge_zero(sum);
2100
while (i >= 0) {
2101
ge tmp;
2102
ge_double(sum, sum, &tmp);
2103
int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
2104
int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
2105
if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
2106
if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
2107
fe t1, t2;
2108
if (b_digit > 0) { ge_madd(sum, sum, b_window + b_digit/2, t1, t2); }
2109
if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
2110
i--;
2111
}
2112
}
2113
2114
// R_check = s[B] - h_ram[pk], where B is the base point
2115
//
2116
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
2117
// => Use only to *check* signatures.
2118
static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
2119
{
2120
ge A; // not secret, not wiped
2121
u32 s32[8]; // not secret, not wiped
2122
load32_le_buf(s32, s, 8);
2123
if (ge_frombytes_vartime(&A, pk) || // A = pk
2124
is_above_l(s32)) { // prevent s malleability
2125
return -1;
2126
}
2127
fe_neg(A.X, A.X);
2128
fe_neg(A.T, A.T); // A = -pk
2129
ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
2130
ge_tobytes(R_check, &A); // R_check = A
2131
return 0;
2132
}
2133
2134
// 5-bit signed comb in cached format (Niels coordinates, Z=1)
2135
static const ge_precomp b_comb_low[8] = {
2136
{{-6816601,-2324159,-22559413,124364,18015490,
2137
8373481,19993724,1979872,-18549925,9085059,},
2138
{10306321,403248,14839893,9633706,8463310,
2139
-8354981,-14305673,14668847,26301366,2818560,},
2140
{-22701500,-3210264,-13831292,-2927732,-16326337,
2141
-14016360,12940910,177905,12165515,-2397893,},},
2142
{{-12282262,-7022066,9920413,-3064358,-32147467,
2143
2927790,22392436,-14852487,2719975,16402117,},
2144
{-7236961,-4729776,2685954,-6525055,-24242706,
2145
-15940211,-6238521,14082855,10047669,12228189,},
2146
{-30495588,-12893761,-11161261,3539405,-11502464,
2147
16491580,-27286798,-15030530,-7272871,-15934455,},},
2148
{{17650926,582297,-860412,-187745,-12072900,
2149
-10683391,-20352381,15557840,-31072141,-5019061,},
2150
{-6283632,-2259834,-4674247,-4598977,-4089240,
2151
12435688,-31278303,1060251,6256175,10480726,},
2152
{-13871026,2026300,-21928428,-2741605,-2406664,
2153
-8034988,7355518,15733500,-23379862,7489131,},},
2154
{{6883359,695140,23196907,9644202,-33430614,
2155
11354760,-20134606,6388313,-8263585,-8491918,},
2156
{-7716174,-13605463,-13646110,14757414,-19430591,
2157
-14967316,10359532,-11059670,-21935259,12082603,},
2158
{-11253345,-15943946,10046784,5414629,24840771,
2159
8086951,-6694742,9868723,15842692,-16224787,},},
2160
{{9639399,11810955,-24007778,-9320054,3912937,
2161
-9856959,996125,-8727907,-8919186,-14097242,},
2162
{7248867,14468564,25228636,-8795035,14346339,
2163
8224790,6388427,-7181107,6468218,-8720783,},
2164
{15513115,15439095,7342322,-10157390,18005294,
2165
-7265713,2186239,4884640,10826567,7135781,},},
2166
{{-14204238,5297536,-5862318,-6004934,28095835,
2167
4236101,-14203318,1958636,-16816875,3837147,},
2168
{-5511166,-13176782,-29588215,12339465,15325758,
2169
-15945770,-8813185,11075932,-19608050,-3776283,},
2170
{11728032,9603156,-4637821,-5304487,-7827751,
2171
2724948,31236191,-16760175,-7268616,14799772,},},
2172
{{-28842672,4840636,-12047946,-9101456,-1445464,
2173
381905,-30977094,-16523389,1290540,12798615,},
2174
{27246947,-10320914,14792098,-14518944,5302070,
2175
-8746152,-3403974,-4149637,-27061213,10749585,},
2176
{25572375,-6270368,-15353037,16037944,1146292,
2177
32198,23487090,9585613,24714571,-1418265,},},
2178
{{19844825,282124,-17583147,11004019,-32004269,
2179
-2716035,6105106,-1711007,-21010044,14338445,},
2180
{8027505,8191102,-18504907,-12335737,25173494,
2181
-5923905,15446145,7483684,-30440441,10009108,},
2182
{-14134701,-4174411,10246585,-14677495,33553567,
2183
-14012935,23366126,15080531,-7969992,7663473,},},
2184
};
2185
2186
static const ge_precomp b_comb_high[8] = {
2187
{{33055887,-4431773,-521787,6654165,951411,
2188
-6266464,-5158124,6995613,-5397442,-6985227,},
2189
{4014062,6967095,-11977872,3960002,8001989,
2190
5130302,-2154812,-1899602,-31954493,-16173976,},
2191
{16271757,-9212948,23792794,731486,-25808309,
2192
-3546396,6964344,-4767590,10976593,10050757,},},
2193
{{2533007,-4288439,-24467768,-12387405,-13450051,
2194
14542280,12876301,13893535,15067764,8594792,},
2195
{20073501,-11623621,3165391,-13119866,13188608,
2196
-11540496,-10751437,-13482671,29588810,2197295,},
2197
{-1084082,11831693,6031797,14062724,14748428,
2198
-8159962,-20721760,11742548,31368706,13161200,},},
2199
{{2050412,-6457589,15321215,5273360,25484180,
2200
124590,-18187548,-7097255,-6691621,-14604792,},
2201
{9938196,2162889,-6158074,-1711248,4278932,
2202
-2598531,-22865792,-7168500,-24323168,11746309,},
2203
{-22691768,-14268164,5965485,9383325,20443693,
2204
5854192,28250679,-1381811,-10837134,13717818,},},
2205
{{-8495530,16382250,9548884,-4971523,-4491811,
2206
-3902147,6182256,-12832479,26628081,10395408,},
2207
{27329048,-15853735,7715764,8717446,-9215518,
2208
-14633480,28982250,-5668414,4227628,242148,},
2209
{-13279943,-7986904,-7100016,8764468,-27276630,
2210
3096719,29678419,-9141299,3906709,11265498,},},
2211
{{11918285,15686328,-17757323,-11217300,-27548967,
2212
4853165,-27168827,6807359,6871949,-1075745,},
2213
{-29002610,13984323,-27111812,-2713442,28107359,
2214
-13266203,6155126,15104658,3538727,-7513788,},
2215
{14103158,11233913,-33165269,9279850,31014152,
2216
4335090,-1827936,4590951,13960841,12787712,},},
2217
{{1469134,-16738009,33411928,13942824,8092558,
2218
-8778224,-11165065,1437842,22521552,-2792954,},
2219
{31352705,-4807352,-25327300,3962447,12541566,
2220
-9399651,-27425693,7964818,-23829869,5541287,},
2221
{-25732021,-6864887,23848984,3039395,-9147354,
2222
6022816,-27421653,10590137,25309915,-1584678,},},
2223
{{-22951376,5048948,31139401,-190316,-19542447,
2224
-626310,-17486305,-16511925,-18851313,-12985140,},
2225
{-9684890,14681754,30487568,7717771,-10829709,
2226
9630497,30290549,-10531496,-27798994,-13812825,},
2227
{5827835,16097107,-24501327,12094619,7413972,
2228
11447087,28057551,-1793987,-14056981,4359312,},},
2229
{{26323183,2342588,-21887793,-1623758,-6062284,
2230
2107090,-28724907,9036464,-19618351,-13055189,},
2231
{-29697200,14829398,-4596333,14220089,-30022969,
2232
2955645,12094100,-13693652,-5941445,7047569,},
2233
{-3201977,14413268,-12058324,-16417589,-9035655,
2234
-7224648,9258160,1399236,30397584,-5684634,},},
2235
};
2236
2237
static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
2238
const ge_precomp comb[8], const u8 scalar[32], int i)
2239
{
2240
u8 teeth = (u8)((scalar_bit(scalar, i) ) +
2241
(scalar_bit(scalar, i + 32) << 1) +
2242
(scalar_bit(scalar, i + 64) << 2) +
2243
(scalar_bit(scalar, i + 96) << 3));
2244
u8 high = teeth >> 3;
2245
u8 index = (teeth ^ (high - 1)) & 7;
2246
FOR (j, 0, 8) {
2247
i32 select = 1 & (((j ^ index) - 1) >> 8);
2248
fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2249
fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2250
fe_ccopy(tmp_c->T2, comb[j].T2, select);
2251
}
2252
fe_neg(tmp_a, tmp_c->T2);
2253
fe_cswap(tmp_c->T2, tmp_a , high ^ 1);
2254
fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2255
ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2256
}
2257
2258
// p = [scalar]B, where B is the base point
2259
static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2260
{
2261
// twin 4-bit signed combs, from Mike Hamburg's
2262
// Fast and compact elliptic-curve cryptography (2012)
2263
// 1 / 2 modulo L
2264
static const u8 half_mod_L[32] = {
2265
247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2266
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
2267
// (2^256 - 1) / 2 modulo L
2268
static const u8 half_ones[32] = {
2269
142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2270
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };
2271
2272
// All bits set form: 1 means 1, 0 means -1
2273
u8 s_scalar[32];
2274
mul_add(s_scalar, scalar, half_mod_L, half_ones);
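// Informal note: the line above computes
//     s_scalar = (scalar + (2^256 - 1)) / 2   (modulo L)
// so that, writing b_i for the bits of s_scalar,
//     scalar == sum_i (2*b_i - 1) * 2^i       (modulo L)
// Every bit of s_scalar thus encodes +1 or -1 (never 0), which is the
// form the signed comb lookups below expect.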
2275
2276
// Double and add ladder
2277
fe tmp_a, tmp_b; // temporaries for addition
2278
ge_precomp tmp_c; // temporary for comb lookup
2279
ge tmp_d; // temporary for doubling
2280
fe_1(tmp_c.Yp);
2281
fe_1(tmp_c.Ym);
2282
fe_0(tmp_c.T2);
2283
2284
// Save a double on the first iteration
2285
ge_zero(p);
2286
lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2287
lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2288
// Regular double & add for the rest
2289
for (int i = 30; i >= 0; i--) {
2290
ge_double(p, p, &tmp_d);
2291
lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2292
lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2293
}
2294
// Note: we could save one addition at the end if we assumed the
2295
// scalar fit in 252 bits, which it does in practice if it is
2296
// selected at random. However, non-random, non-hashed scalars
2297
// *can* overflow 252 bits in practice. Better to account for that
2298
// than to leave that kind of subtle corner case.
2299
2300
WIPE_BUFFER(tmp_a); WIPE_CTX(&tmp_d);
2301
WIPE_BUFFER(tmp_b); WIPE_CTX(&tmp_c);
2302
WIPE_BUFFER(s_scalar);
2303
}
2304
2305
void crypto_sign_public_key_custom_hash(u8 public_key[32],
2306
const u8 secret_key[32],
2307
const crypto_sign_vtable *hash)
2308
{
2309
u8 a[64];
2310
hash->hash(a, secret_key, 32);
2311
trim_scalar(a);
2312
ge A;
2313
ge_scalarmult_base(&A, a);
2314
ge_tobytes(public_key, &A);
2315
WIPE_BUFFER(a);
2316
WIPE_CTX(&A);
2317
}
2318
2319
void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
2320
{
2321
crypto_sign_public_key_custom_hash(public_key, secret_key,
2322
&crypto_blake2b_vtable);
2323
}
2324
2325
void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
2326
const u8 secret_key[32],
2327
const u8 public_key[32],
2328
const crypto_sign_vtable *hash)
2329
{
2330
ctx->hash = hash; // set vtable
2331
u8 *a = ctx->buf;
2332
u8 *prefix = ctx->buf + 32;
2333
ctx->hash->hash(a, secret_key, 32);
2334
trim_scalar(a);
2335
2336
if (public_key == 0) {
2337
crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
2338
} else {
2339
COPY(ctx->pk, public_key, 32);
2340
}
2341
2342
// Deterministic part of EdDSA: Construct a nonce by hashing the message
2343
// instead of generating a random number.
2344
// An actual random number would work just fine, and would save us
2345
// the trouble of hashing the message twice. If we did that
2346
// however, the user could fuck it up and reuse the nonce.
2347
ctx->hash->init (ctx);
2348
ctx->hash->update(ctx, prefix , 32);
2349
}
2350
2351
void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
2352
const u8 secret_key[32],
2353
const u8 public_key[32])
2354
{
2355
crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
2356
&crypto_blake2b_vtable);
2357
}
2358
2359
void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
2360
const u8 *msg, size_t msg_size)
2361
{
2362
ctx->hash->update(ctx, msg, msg_size);
2363
}
2364
2365
void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
2366
{
2367
u8 *r = ctx->buf + 32;
2368
u8 *half_sig = ctx->buf + 64;
2369
ctx->hash->final(ctx, r);
2370
reduce(r);
2371
2372
// first half of the signature = "random" nonce times the base point
2373
ge R;
2374
ge_scalarmult_base(&R, r);
2375
ge_tobytes(half_sig, &R);
2376
WIPE_CTX(&R);
2377
2378
// Hash R, the public key, and the message together.
2379
// It cannot be done in parallel with the first hash.
2380
ctx->hash->init (ctx);
2381
ctx->hash->update(ctx, half_sig, 32);
2382
ctx->hash->update(ctx, ctx->pk , 32);
2383
}
2384
2385
void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
2386
{
2387
u8 *a = ctx->buf;
2388
u8 *r = ctx->buf + 32;
2389
u8 *half_sig = ctx->buf + 64;
2390
u8 h_ram[64];
2391
ctx->hash->final(ctx, h_ram);
2392
reduce(h_ram);
2393
COPY(signature, half_sig, 32);
2394
mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
2395
WIPE_BUFFER(h_ram);
2396
crypto_wipe(ctx, ctx->hash->ctx_size);
2397
}
2398
2399
void crypto_sign(u8 signature[64],
2400
const u8 secret_key[32],
2401
const u8 public_key[32],
2402
const u8 *message, size_t message_size)
2403
{
2404
crypto_sign_ctx ctx;
2405
crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
2406
crypto_sign_init_first_pass (actx, secret_key, public_key);
2407
crypto_sign_update (actx, message, message_size);
2408
crypto_sign_init_second_pass(actx);
2409
crypto_sign_update (actx, message, message_size);
2410
crypto_sign_final (actx, signature);
2411
}
2412
2413
void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
2414
const u8 signature[64],
2415
const u8 public_key[32],
2416
const crypto_sign_vtable *hash)
2417
{
2418
ctx->hash = hash; // set vtable
2419
COPY(ctx->buf, signature , 64);
2420
COPY(ctx->pk , public_key, 32);
2421
ctx->hash->init (ctx);
2422
ctx->hash->update(ctx, signature , 32);
2423
ctx->hash->update(ctx, public_key, 32);
2424
}
2425
2426
void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
2427
const u8 public_key[32])
2428
{
2429
crypto_check_init_custom_hash(ctx, signature, public_key,
2430
&crypto_blake2b_vtable);
2431
}
2432
2433
void crypto_check_update(crypto_check_ctx_abstract *ctx,
2434
const u8 *msg, size_t msg_size)
2435
{
2436
ctx->hash->update(ctx, msg, msg_size);
2437
}
2438
2439
int crypto_check_final(crypto_check_ctx_abstract *ctx)
2440
{
2441
u8 h_ram[64];
2442
ctx->hash->final(ctx, h_ram);
2443
reduce(h_ram);
2444
u8 *R = ctx->buf; // R
2445
u8 *s = ctx->buf + 32; // s
2446
u8 *R_check = ctx->pk; // overwrite ctx->pk to save stack space
2447
if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
2448
return -1;
2449
}
2450
return crypto_verify32(R, R_check); // R == R_check ? OK : fail
2451
}
2452
2453
int crypto_check(const u8 signature[64], const u8 public_key[32],
2454
const u8 *message, size_t message_size)
2455
{
2456
crypto_check_ctx ctx;
2457
crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
2458
crypto_check_init (actx, signature, public_key);
2459
crypto_check_update(actx, message, message_size);
2460
return crypto_check_final(actx);
2461
}
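// Illustrative usage sketch (not part of the library): a sign/verify
// round trip with the one-shot functions above. Variable names are
// made up; the secret key is assumed to be 32 random bytes.
//
//     uint8_t sk[32];                       // random secret key
//     uint8_t pk[32];
//     crypto_sign_public_key(pk, sk);
//
//     const uint8_t msg[] = "hello";
//     uint8_t sig[64];
//     crypto_sign(sig, sk, pk, msg, sizeof msg);
//     if (crypto_check(sig, pk, msg, sizeof msg) != 0) {
//         // forged or corrupted: reject the message
//     }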
2462
2463
///////////////////////
2464
/// EdDSA to X25519 ///
2465
///////////////////////
2466
void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
2467
{
2468
u8 a[64];
2469
crypto_blake2b(a, eddsa, 32);
2470
COPY(x25519, a, 32);
2471
WIPE_BUFFER(a);
2472
}
2473
2474
void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
2475
{
2476
fe t1, t2;
2477
fe_frombytes(t2, eddsa);
2478
fe_add(t1, fe_one, t2);
2479
fe_sub(t2, fe_one, t2);
2480
fe_invert(t2, t2);
2481
fe_mul(t1, t1, t2);
2482
fe_tobytes(x25519, t1);
2483
WIPE_BUFFER(t1);
2484
WIPE_BUFFER(t2);
2485
}
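// Illustrative usage sketch (not part of the library): reusing an
// EdDSA identity for key exchange. Variable names are made up; ed_pk
// is assumed to have been derived from ed_sk with
// crypto_sign_public_key(), in which case x_sk and x_pk should form a
// matching X25519 key pair.
//
//     uint8_t ed_sk[32], ed_pk[32];    // existing EdDSA key pair
//     uint8_t x_sk [32], x_pk [32];
//     crypto_from_eddsa_private(x_sk, ed_sk);
//     crypto_from_eddsa_public (x_pk, ed_pk);
//     // x_sk / x_pk can now be used with crypto_x25519().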
2486
2487
/////////////////////////////////////////////
2488
/// Dirty ephemeral public key generation ///
2489
/////////////////////////////////////////////
2490
2491
// These functions generate a public key *without* clearing the
2492
// cofactor. Sending that key over the network leaks 3 bits of the
2493
// private key. Use only to generate ephemeral keys that will be hidden
2494
// with crypto_curve_to_hidden().
2495
//
2496
// The public key is otherwise compatible with crypto_x25519() and
2497
// crypto_key_exchange() (those properly clear the cofactor).
2498
//
2499
// Note that the distribution of the resulting public keys is almost
2500
// uniform. Flipping the sign of the v coordinate (not provided by this
2501
// function) covers the entire key space almost perfectly, where
2502
// "almost" means a 2^-128 bias (undetectable). This uniformity is
2503
// needed to ensure the proper randomness of the resulting
2504
// representatives (once we apply crypto_curve_to_hidden()).
2505
//
2506
// Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2507
// to be confused with the prime order of the main subgroup, L, which is
2508
// 8 times less than that).
2509
//
2510
// Generating all points would require us to multiply a point of order C
2511
// (the base point plus any point of order 8) by all scalars from 0 to
2512
// C-1. Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2513
// by negating the resulting point at random, we also cover scalars from
2514
// -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2515
//
2516
// In practice:
2517
// - Scalars from 0 to e + 1 are never generated
2518
// - Scalars from 2^255 to 2^255 + e are never generated
2519
// - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2520
//
2521
// Since e < 2^128, detecting this bias requires observing over 2^100
2522
// representatives from a given source (this will never happen), *and*
2523
// recovering enough of the private key to determine that they do, or do
2524
// not, belong to the biased set (this practically requires solving
2525
// discrete logarithm, which is conjecturally intractable).
2526
//
2527
// In practice, this means the bias is impossible to detect.
2528
2529
// s = s + (x % 8) * L, i.e. s + x*L modulo 8*L
2530
// Guaranteed to fit in 256 bits iff s fits in 255 bits.
2531
// L < 2^253
2532
// x%8 < 2^3
2533
// L * (x%8) < 2^255
2534
// s < 2^255
2535
// s + L * (x%8) < 2^256
2536
static void add_xl(u8 s[32], u8 x)
2537
{
2538
u64 mod8 = x & 7;
2539
u64 carry = 0;
2540
FOR (i , 0, 8) {
2541
carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2542
store32_le(s + 4*i, (u32)carry);
2543
carry >>= 32;
2544
}
2545
}
2546
2547
// "Small" dirty ephemeral key.
2548
// Use if you need to shrink the size of the binary, and can afford to
2549
// slow down by a factor of two (compared to the fast version)
2550
//
2551
// This version works by decoupling the cofactor from the main factor.
2552
//
2553
// - The trimmed scalar determines the main factor
2554
// - The clamped bits of the scalar determine the cofactor.
2555
//
2556
// Cofactor and main factor are combined into a single scalar, which is
2557
// then multiplied by a point of order 8*L (unlike the base point, which
2558
// has prime order). That "dirty" base point is the addition of the
2559
// regular base point (9), and a point of order 8.
2560
void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2561
{
2562
// Base point of order 8*L
2563
// Raw scalar multiplication with it does not clear the cofactor,
2564
// and the resulting public key will reveal 3 bits of the scalar.
2565
static const u8 dirty_base_point[32] = {
2566
0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
2567
0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
2568
0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
2569
// separate the main factor & the cofactor of the scalar
2570
u8 scalar[32];
2571
COPY(scalar, secret_key, 32);
2572
trim_scalar(scalar);
2573
2574
// Separate the main factor and the cofactor
2575
//
2576
// The scalar is trimmed, so its cofactor is cleared. The three
2577
// least significant bits however still have a main factor. We must
2578
// remove it for X25519 compatibility.
2579
//
2580
// We exploit the fact that 5*L = 1 (modulo 8)
2581
// cofactor = lsb * 5 * L (modulo 8*L)
2582
// combined = scalar + cofactor (modulo 8*L)
2583
// combined = scalar + (lsb * 5 * L) (modulo 8*L)
2584
add_xl(scalar, secret_key[0] * 5);
2585
scalarmult(public_key, scalar, dirty_base_point, 256);
2586
WIPE_BUFFER(scalar);
2587
}
2588
2589
// "Fast" dirty ephemeral key
2590
// We use this one by default.
2591
//
2592
// This version works by performing a regular scalar multiplication,
2593
// then adding a low-order point. The scalar multiplication is done in
2594
// Edwards space for more speed (*2 compared to the "small" version).
2595
// The cost is a bigger binary for programs that don't also sign messages.
2596
void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
2597
{
2598
u8 scalar[32];
2599
ge pk;
2600
COPY(scalar, secret_key, 32);
2601
trim_scalar(scalar);
2602
ge_scalarmult_base(&pk, scalar);
2603
2604
// Select low order point
2605
// We're computing the [cofactor]lop scalar multiplication, where:
2606
// cofactor = tweak & 7.
2607
// lop = (lop_x, lop_y)
2608
// lop_x = sqrt((sqrt(d + 1) + 1) / d)
2609
// lop_y = -lop_x * sqrtm1
2610
// Notes:
2611
// - A (single) Montgomery ladder would be twice as slow.
2612
// - An actual scalar multiplication would hurt performance.
2613
// - A full table lookup would take more code.
2614
u8 cofactor = secret_key[0] & 7;
2615
int a = (cofactor >> 2) & 1;
2616
int b = (cofactor >> 1) & 1;
2617
int c = (cofactor >> 0) & 1;
2618
fe t1, t2, t3;
2619
fe_0(t1);
2620
fe_ccopy(t1, sqrtm1, b);
2621
fe_ccopy(t1, lop_x , c);
2622
fe_neg (t3, t1);
2623
fe_ccopy(t1, t3, a);
2624
fe_1(t2);
2625
fe_0(t3);
2626
fe_ccopy(t2, t3 , b);
2627
fe_ccopy(t2, lop_y, c);
2628
fe_neg (t3, t2);
2629
fe_ccopy(t2, t3, a^b);
2630
ge_precomp low_order_point;
2631
fe_add(low_order_point.Yp, t2, t1);
2632
fe_sub(low_order_point.Ym, t2, t1);
2633
fe_mul(low_order_point.T2, t2, t1);
2634
fe_mul(low_order_point.T2, low_order_point.T2, D2);
2635
2636
// Add low order point to the public key
2637
ge_madd(&pk, &pk, &low_order_point, t1, t2);
2638
2639
// Convert to Montgomery u coordinate (we ignore the sign)
2640
fe_add(t1, pk.Z, pk.Y);
2641
fe_sub(t2, pk.Z, pk.Y);
2642
fe_invert(t2, t2);
2643
fe_mul(t1, t1, t2);
2644
2645
fe_tobytes(public_key, t1);
2646
2647
WIPE_BUFFER(t1); WIPE_BUFFER(scalar);
2648
WIPE_BUFFER(t2); WIPE_CTX(&pk);
2649
WIPE_BUFFER(t3); WIPE_CTX(&low_order_point);
2650
}
2651
2652
///////////////////
2653
/// Elligator 2 ///
2654
///////////////////
2655
static const fe A = {486662};
2656
2657
// Elligator direct map
2658
//
2659
// Computes the point corresponding to a representative, encoded in 32
2660
// bytes (little-endian). Since positive representatives fit in 254
2661
// bits, the two most significant bits are ignored.
2662
//
2663
// From the paper:
2664
// w = -A / (fe(1) + non_square * r^2)
2665
// e = chi(w^3 + A*w^2 + w)
2666
// u = e*w - (fe(1)-e)*(A//2)
2667
// v = -e * sqrt(u^3 + A*u^2 + u)
2668
//
2669
// We ignore v because we don't need it for X25519 (the Montgomery
2670
// ladder only uses u).
2671
//
2672
// Note that e is either 0, 1 or -1
2673
// if e = 0 u = 0 and v = 0
2674
// if e = 1 u = w
2675
// if e = -1 u = -w - A = w * non_square * r^2
2676
//
2677
// Let r1 = non_square * r^2
2678
// Let r2 = 1 + r1
2679
// Note that r2 cannot be zero, since -1/non_square is not a square.
2680
// We can (tediously) verify that:
2681
// w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
2682
// Therefore:
2683
// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
2684
// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
2685
// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
2686
// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6)
2687
// chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3)
2688
// Corollary:
2689
// e = 1 if (A^2*r1 - r2^2) * A * r2^3 is a non-zero square
2690
// e = -1 if (A^2*r1 - r2^2) * A * r2^3 is not a square
2691
// Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
2692
// w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
2693
// w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
2694
// w^3 + A*w^2 + w = w * ((w + A/2)^2 - A^2/4 + 1)
2695
// which is zero only if:
2696
// w = 0 (impossible)
2697
// (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
2698
//
2699
// Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3)
2700
// isr = sqrt(1 / ((A^2*r1 - r2^2) * A * r2^3)) if e = 1
2701
// isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1
2702
//
2703
// if e = 1
2704
// let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
2705
// u1 = w
2706
// u1 = u
2707
//
2708
// if e = -1
2709
// let ufactor = -non_square * sqrt(-1) * r^2
2710
// let vfactor = sqrt(ufactor)
2711
// let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
2712
// u2 = w * -1 * -non_square * r^2
2713
// u2 = w * non_square * r^2
2714
// u2 = u
2715
void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
2716
{
2717
// Representatives are encoded in 254 bits.
2718
// The two most significant ones are random padding that must be ignored.
2719
u8 clamped[32];
2720
COPY(clamped, hidden, 32);
2721
clamped[31] &= 0x3f;
2722
2723
fe r, u, t1, t2, t3;
2724
fe_frombytes(r, clamped);
2725
fe_sq2(t1, r);
2726
fe_add(u, t1, fe_one);
2727
fe_sq (t2, u);
2728
fe_mul(t3, A2, t1);
2729
fe_sub(t3, t3, t2);
2730
fe_mul(t3, t3, A);
2731
fe_mul(t1, t2, u);
2732
fe_mul(t1, t3, t1);
2733
int is_square = invsqrt(t1, t1);
2734
fe_sq(u, r);
2735
fe_mul(u, u, ufactor);
2736
fe_ccopy(u, fe_one, is_square);
2737
fe_sq (t1, t1);
2738
fe_mul(u, u, A);
2739
fe_mul(u, u, t3);
2740
fe_mul(u, u, t2);
2741
fe_mul(u, u, t1);
2742
fe_neg(u, u);
2743
fe_tobytes(curve, u);
2744
2745
WIPE_BUFFER(t1); WIPE_BUFFER(r);
2746
WIPE_BUFFER(t2); WIPE_BUFFER(u);
2747
WIPE_BUFFER(t3); WIPE_BUFFER(clamped);
2748
}
2749
2750
// Elligator inverse map
2751
//
2752
// Computes the representative of a point, if possible. If not, it does
2753
// nothing and returns -1. Note that the success of the operation
2754
// depends only on the point (more precisely its u coordinate). The
2755
// tweak parameter is used only upon success.
2756
//
2757
// The tweak should be a random byte. Beyond that, its contents are an
2758
// implementation detail. Currently, the tweak comprises:
2759
// - Bit 0 : sign of the v coordinate (0 if positive, 1 if negative)
2760
// - Bits 1-5: not used
2761
// - Bits 6-7: random padding
2762
//
2763
// From the paper:
2764
// Let sq = -non_square * u * (u+A)
2765
// if sq is not a square, or u = -A, there is no mapping
2766
// Assuming there is a mapping:
2767
// if v is positive: r = sqrt(-(u+A) / u)
2768
// if v is negative: r = sqrt(-u / (u+A))
2769
//
2770
// We compute isr = invsqrt(-non_square * u * (u+A))
2771
// if it wasn't a non-zero square, abort.
2772
// else, isr = sqrt(-1 / (non_square * u * (u+A)))
2773
//
2774
// This causes us to abort if u is zero, even though we shouldn't. This
2775
// never happens in practice, because (i) a random point in the curve has
2776
// a negligible chance of being zero, and (ii) scalar multiplication with
2777
// a trimmed scalar *never* yields zero.
2778
//
2779
// Since:
2780
// isr * (u+A) = sqrt(-1 / (non_square * u * (u+A))) * (u+A)
2781
// isr * (u+A) = sqrt(-(u+A) / (non_square * u))
2782
// and:
2783
// isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
2784
// isr * u = sqrt(-u / (non_square * (u+A)))
2785
// Therefore:
2786
// if v is positive: r = isr * (u+A)
2787
// if v is negative: r = isr * u
2788
int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
2789
{
2790
fe t1, t2, t3;
2791
fe_frombytes(t1, public_key);
2792
2793
fe_add(t2, t1, A);
2794
fe_mul(t3, t1, t2);
2795
fe_mul_small(t3, t3, -2);
2796
int is_square = invsqrt(t3, t3);
2797
if (!is_square) {
2798
// The only variable time bit. This ultimately reveals how many
2799
// tries it took us to find a representable key.
2800
// This does not affect security as long as we try keys at random.
2801
WIPE_BUFFER(t1);
2802
WIPE_BUFFER(t2);
2803
WIPE_BUFFER(t3);
2804
return -1;
2805
}
2806
fe_ccopy (t1, t2, tweak & 1);
2807
fe_mul (t3, t1, t3);
2808
fe_mul_small(t1, t3, 2);
2809
fe_neg (t2, t3);
2810
fe_ccopy (t3, t2, fe_isodd(t1));
2811
fe_tobytes(hidden, t3);
2812
2813
// Pad with two random bits
2814
hidden[31] |= tweak & 0xc0;
2815
2816
WIPE_BUFFER(t1);
2817
WIPE_BUFFER(t2);
2818
WIPE_BUFFER(t3);
2819
return 0;
2820
}
2821
2822
void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
2823
{
2824
u8 pk [32]; // public key
2825
u8 buf[64]; // seed + representative
2826
COPY(buf + 32, seed, 32);
2827
do {
2828
crypto_chacha20(buf, 0, 64, buf+32, zero);
2829
crypto_x25519_dirty_fast(pk, buf); // or the "small" version
2830
} while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
2831
// Note that the return value of crypto_curve_to_hidden() is
2832
// independent from its tweak parameter.
2833
// Therefore, buf[32] is not actually reused. Either we loop one
2834
// more time and buf[32] is used for the new seed, or we succeeded,
2835
// and buf[32] becomes the tweak parameter.
2836
2837
crypto_wipe(seed, 32);
2838
COPY(hidden , buf + 32, 32);
2839
COPY(secret_key, buf , 32);
2840
WIPE_BUFFER(buf);
2841
WIPE_BUFFER(pk);
2842
}
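// Illustrative usage sketch (not part of the library): Elligator-based
// ephemeral key exchange. The sender transmits the uniform-looking
// representative; the receiver maps it back to a curve point and runs
// a regular X25519. Variable names are made up; seed is assumed to be
// 32 random bytes (the call wipes it).
//
//     uint8_t seed[32];                      // 32 random bytes
//     uint8_t hidden[32], esk[32];
//     crypto_hidden_key_pair(hidden, esk, seed);
//     // ... send hidden over the wire ...
//
//     uint8_t epk[32];                       // receiver's side
//     crypto_hidden_to_curve(epk, hidden);
//     // both sides can now run crypto_x25519() / crypto_key_exchange()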
2843
2844
////////////////////
2845
/// Key exchange ///
2846
////////////////////
2847
void crypto_key_exchange(u8 shared_key[32],
2848
const u8 your_secret_key [32],
2849
const u8 their_public_key[32])
2850
{
2851
crypto_x25519(shared_key, your_secret_key, their_public_key);
2852
crypto_hchacha20(shared_key, shared_key, zero);
2853
}
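// Illustrative usage sketch (not part of the library): unlike raw
// crypto_x25519(), the shared_key produced here is already hashed
// with HChaCha20, so it can be used directly as a crypto_lock() key.
// my_sk and their_pk are made-up names for this example.
//
//     uint8_t shared[32];
//     crypto_key_exchange(shared, my_sk, their_pk);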
2854
2855
///////////////////////
2856
/// Scalar division ///
2857
///////////////////////
2858
2859
// Montgomery reduction.
2860
// Divides x by (2^256), and reduces the result modulo L
2861
//
2862
// Precondition:
2863
// x < L * 2^256
2864
// Constants:
2865
// r = 2^256 (makes division by r trivial)
2866
// k = (r * (1/r) - 1) // L (1/r is computed modulo L )
2867
// Algorithm:
2868
// s = (x * k) % r
2869
// t = x + s*L (t is always a multiple of r)
2870
// u = (t/r) % L (u is always below 2*L, conditional subtraction is enough)
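// Informal note: k below is the usual Montgomery constant, i.e.
// k == -1/L (modulo 2^256). That is why t = x + s*L with s = x*k % r
// is a multiple of r: t == x + x*k*L == x - x == 0 (modulo 2^256).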
2871
static void redc(u32 u[8], u32 x[16])
2872
{
2873
static const u32 k[8] = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
2874
0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
2875
static const u32 l[8] = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
2876
0x00000000, 0x00000000, 0x00000000, 0x10000000,};
2877
// s = x * k (modulo 2^256)
2878
// This is cheaper than the full multiplication.
2879
u32 s[8] = {0};
2880
FOR (i, 0, 8) {
2881
u64 carry = 0;
2882
FOR (j, 0, 8-i) {
2883
carry += s[i+j] + (u64)x[i] * k[j];
2884
s[i+j] = (u32)carry;
2885
carry >>= 32;
2886
}
2887
}
2888
u32 t[16] = {0};
2889
multiply(t, s, l);
2890
2891
// t = t + x
2892
u64 carry = 0;
2893
FOR (i, 0, 16) {
2894
carry += (u64)t[i] + x[i];
2895
t[i] = (u32)carry;
2896
carry >>= 32;
2897
}
2898
2899
// u = (t / 2^256) % L
2900
// Note that t / 2^256 is always below 2*L,
2901
// so a constant-time conditional subtraction is enough.
2902
// We work with L directly, in a 2's complement encoding
2903
// (-L == ~L + 1)
2904
remove_l(u, t+8);
2905
2906
WIPE_BUFFER(s);
2907
WIPE_BUFFER(t);
2908
}
2909
2910
void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
2911
const u8 curve_point[32])
2912
{
2913
static const u8 Lm2[32] = { // L - 2
2914
0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
2915
0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2916
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
2917
// 1 in Montgomery form
2918
u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
2919
0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};
2920
2921
u8 scalar[32];
2922
COPY(scalar, private_key, 32);
2923
trim_scalar(scalar);
2924
2925
// Convert the scalar in Montgomery form
2926
// m_scl = scalar * 2^256 (modulo L)
2927
u32 m_scl[8];
2928
{
2929
u32 tmp[16];
2930
ZERO(tmp, 8);
2931
load32_le_buf(tmp+8, scalar, 8);
2932
mod_l(scalar, tmp);
2933
load32_le_buf(m_scl, scalar, 8);
2934
WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
2935
}
2936
2937
u32 product[16];
2938
for (int i = 252; i >= 0; i--) {
2939
ZERO(product, 16);
2940
multiply(product, m_inv, m_inv);
2941
redc(m_inv, product);
2942
if (scalar_bit(Lm2, i)) {
2943
ZERO(product, 16);
2944
multiply(product, m_inv, m_scl);
2945
redc(m_inv, product);
2946
}
2947
}
2948
// Convert the inverse *out* of Montgomery form
2949
// scalar = m_inv / 2^256 (modulo L)
2950
COPY(product, m_inv, 8);
2951
ZERO(product + 8, 8);
2952
redc(m_inv, product);
2953
store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
2954
2955
// Clear the cofactor of scalar:
2956
// cleared = scalar * (3*L + 1) (modulo 8*L)
2957
// cleared = scalar + scalar * 3 * L (modulo 8*L)
2958
// Note that (scalar * 3) is reduced modulo 8, so we only need the
2959
// first byte.
2960
add_xl(scalar, scalar[0] * 3);
2961
2962
// Recall that 8*L < 2^256. However it is also very close to
2963
// 2^255. If we spanned the ladder over 255 bits, random tests
2964
// wouldn't catch the off-by-one error.
2965
scalarmult(blind_salt, scalar, curve_point, 256);
2966
2967
WIPE_BUFFER(scalar); WIPE_BUFFER(m_scl);
2968
WIPE_BUFFER(product); WIPE_BUFFER(m_inv);
2969
}
2970
2971
////////////////////////////////
2972
/// Authenticated encryption ///
2973
////////////////////////////////
2974
static void lock_auth(u8 mac[16], const u8 auth_key[32],
2975
const u8 *ad , size_t ad_size,
2976
const u8 *cipher_text, size_t text_size)
2977
{
2978
u8 sizes[16]; // Not secret, not wiped
2979
store64_le(sizes + 0, ad_size);
2980
store64_le(sizes + 8, text_size);
2981
crypto_poly1305_ctx poly_ctx; // auto wiped...
2982
crypto_poly1305_init (&poly_ctx, auth_key);
2983
crypto_poly1305_update(&poly_ctx, ad , ad_size);
2984
crypto_poly1305_update(&poly_ctx, zero , align(ad_size, 16));
2985
crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
2986
crypto_poly1305_update(&poly_ctx, zero , align(text_size, 16));
2987
crypto_poly1305_update(&poly_ctx, sizes , 16);
2988
crypto_poly1305_final (&poly_ctx, mac); // ...here
2989
}
2990
2991
void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
2992
const u8 key[32], const u8 nonce[24],
2993
const u8 *ad , size_t ad_size,
2994
const u8 *plain_text, size_t text_size)
2995
{
2996
u8 sub_key[32];
2997
u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
2998
crypto_hchacha20(sub_key, key, nonce);
2999
crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
3000
crypto_chacha20_ctr(cipher_text, plain_text, text_size,
3001
sub_key, nonce + 16, 1);
3002
lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
3003
WIPE_BUFFER(sub_key);
3004
WIPE_BUFFER(auth_key);
3005
}
3006
3007
int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
3008
const u8 mac[16],
3009
const u8 *ad , size_t ad_size,
3010
const u8 *cipher_text, size_t text_size)
3011
{
3012
u8 sub_key[32];
3013
u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
3014
crypto_hchacha20(sub_key, key, nonce);
3015
crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
3016
u8 real_mac[16];
3017
lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
3018
WIPE_BUFFER(auth_key);
3019
if (crypto_verify16(mac, real_mac)) {
3020
WIPE_BUFFER(sub_key);
3021
WIPE_BUFFER(real_mac);
3022
return -1;
3023
}
3024
crypto_chacha20_ctr(plain_text, cipher_text, text_size,
3025
sub_key, nonce + 16, 1);
3026
WIPE_BUFFER(sub_key);
3027
WIPE_BUFFER(real_mac);
3028
return 0;
3029
}
3030
3031
void crypto_lock(u8 mac[16], u8 *cipher_text,
3032
const u8 key[32], const u8 nonce[24],
3033
const u8 *plain_text, size_t text_size)
3034
{
3035
crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
3036
}
3037
3038
int crypto_unlock(u8 *plain_text,
3039
const u8 key[32], const u8 nonce[24], const u8 mac[16],
3040
const u8 *cipher_text, size_t text_size)
3041
{
3042
return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
3043
cipher_text, text_size);
3044
}
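// Illustrative usage sketch (not part of the library): authenticated
// encryption round trip with the functions above. Variable names are
// made up; the key is assumed to be 32 secret random (or key-exchanged)
// bytes and the nonce 24 random bytes, never reused with the same key.
//
//     uint8_t key[32], nonce[24];
//     const uint8_t plain[6] = "hello";
//     uint8_t cipher[6], mac[16];
//     crypto_lock(mac, cipher, key, nonce, plain, sizeof plain);
//
//     uint8_t decrypted[6];
//     if (crypto_unlock(decrypted, key, nonce, mac,
//                       cipher, sizeof cipher) != 0) {
//         // corrupted or forged: nothing was written to decrypted
//     }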
3045
3046