CoCalc -- monocypher.cpp

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: Ardupilot/ardupilot
Path: blob/master/libraries/AP_CheckFirmware/monocypher.cpp
Views: ¹⁷⁹⁸
1
// Monocypher version 3.1.2
2
//
3
// This file is dual-licensed.  Choose whichever licence you want from
4
// the two licences listed below.
5
//
6
// The first licence is a regular 2-clause BSD licence.  The second licence
7
// is the CC-0 from Creative Commons. It is intended to release Monocypher
8
// to the public domain.  The BSD licence serves as a fallback option.
9
//
10
// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
11
//
12
// ------------------------------------------------------------------------
13
//
14
// Copyright (c) 2017-2020, Loup Vaillant
15
// All rights reserved.
16
//
17
//
18
// Redistribution and use in source and binary forms, with or without
19
// modification, are permitted provided that the following conditions are
20
// met:
21
//
22
// 1. Redistributions of source code must retain the above copyright
23
//    notice, this list of conditions and the following disclaimer.
24
//
25
// 2. Redistributions in binary form must reproduce the above copyright
26
//    notice, this list of conditions and the following disclaimer in the
27
//    documentation and/or other materials provided with the
28
//    distribution.
29
//
30
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41
//
42
// ------------------------------------------------------------------------
43
//
44
// Written in 2017-2020 by Loup Vaillant
45
//
46
// To the extent possible under law, the author(s) have dedicated all copyright
47
// and related neighboring rights to this software to the public domain
48
// worldwide.  This software is distributed without any warranty.
49
//
50
// You should have received a copy of the CC0 Public Domain Dedication along
51
// with this software.  If not, see
52
// <https://creativecommons.org/publicdomain/zero/1.0/>
53

54
#include "monocypher.h"
55

56
// we don't need Argon2
57
#define MONOCYPHER_ARGON2_ENABLE 0
58

59
// we want the bootloader to be as small as possible
60
#define BLAKE2_NO_UNROLLING 1
61

62
/////////////////
63
/// Utilities ///
64
/////////////////
65
#define FOR_T(type, i0, start, end) for (type i0 = (start); i0 < (end); i0++)
66
#define FOR(i1, start, end)        FOR_T(size_t, i1, start, end)
67
#define COPY(dst, src, size)       FOR(i2, 0, size) (dst)[i2] = (src)[i2]
68
#define ZERO(buf, size)            FOR(i3, 0, size) (buf)[i3] = 0
69
#define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
70
#define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
71
#define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
72
#define MAX(a, b)                  ((a) >= (b) ? (a) : (b))
73

74
typedef int8_t   i8;
75
typedef uint8_t  u8;
76
typedef int16_t  i16;
77
typedef uint32_t u32;
78
typedef int32_t  i32;
79
typedef int64_t  i64;
80
typedef uint64_t u64;
81

82
static const u8 zero[128] = {0};
83

84
// returns the smallest positive integer y such that
85
// (x + y) % pow_2  == 0
86
// Basically, it's how many bytes we need to add to "align" x.
87
// Only works when pow_2 is a power of 2.
88
// Note: we use ~x+1 instead of -x to avoid compiler warnings
89
static size_t align(size_t x, size_t pow_2)
90
{
91
    return (~x + 1) & (pow_2 - 1);
92
}
93

94
static u32 load24_le(const u8 s[3])
95
{
96
    return (u32)s[0]
97
        | ((u32)s[1] <<  8)
98
        | ((u32)s[2] << 16);
99
}
100

101
static u32 load32_le(const u8 s[4])
102
{
103
    return (u32)s[0]
104
        | ((u32)s[1] <<  8)
105
        | ((u32)s[2] << 16)
106
        | ((u32)s[3] << 24);
107
}
108

109
static u64 load64_le(const u8 s[8])
110
{
111
    return load32_le(s) | ((u64)load32_le(s+4) << 32);
112
}
113

114
static void store32_le(u8 out[4], u32 in)
115
{
116
    out[0] =  in        & 0xff;
117
    out[1] = (in >>  8) & 0xff;
118
    out[2] = (in >> 16) & 0xff;
119
    out[3] = (in >> 24) & 0xff;
120
}
121

122
static void store64_le(u8 out[8], u64 in)
123
{
124
    store32_le(out    , (u32)in );
125
    store32_le(out + 4, in >> 32);
126
}
127

128
static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
129
    FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
130
}
131
static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
132
    FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
133
}
134
static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
135
    FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
136
}
137
static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
138
    FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
139
}
140

141
static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
142
static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
143

144
static int neq0(u64 diff)
145
{   // constant time comparison to zero
146
    // return diff != 0 ? -1 : 0
147
    u64 half = (diff >> 32) | ((u32)diff);
148
    return (1 & ((half - 1) >> 32)) - 1;
149
}
150

151
static u64 x16(const u8 a[16], const u8 b[16])
152
{
153
    return (load64_le(a + 0) ^ load64_le(b + 0))
154
        |  (load64_le(a + 8) ^ load64_le(b + 8));
155
}
156
static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
157
static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
158
int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
159
int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
160
int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
161

162
void crypto_wipe(void *secret, size_t size)
163
{
164
    volatile u8 *v_secret = (u8*)secret;
165
    ZERO(v_secret, size);
166
}
167

168
/////////////////
169
/// Chacha 20 ///
170
/////////////////
171
#define QUARTERROUND(a, b, c, d)     \
172
    a += b;  d = rotl32(d ^ a, 16);  \
173
    c += d;  b = rotl32(b ^ c, 12);  \
174
    a += b;  d = rotl32(d ^ a,  8);  \
175
    c += d;  b = rotl32(b ^ c,  7)
176

177
static void chacha20_rounds(u32 out[16], const u32 in[16])
178
{
179
    // The temporary variables make Chacha20 10% faster.
180
    u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
181
    u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
182
    u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
183
    u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];
184

185
    FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
186
        QUARTERROUND(t0, t4, t8 , t12); // column 0
187
        QUARTERROUND(t1, t5, t9 , t13); // column 1
188
        QUARTERROUND(t2, t6, t10, t14); // column 2
189
        QUARTERROUND(t3, t7, t11, t15); // column 3
190
        QUARTERROUND(t0, t5, t10, t15); // diagonal 0
191
        QUARTERROUND(t1, t6, t11, t12); // diagonal 1
192
        QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
193
        QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
194
    }
195
    out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
196
    out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
197
    out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
198
    out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
199
}
200

201
static void chacha20_init_key(u32 block[16], const u8 key[32])
202
{
203
    load32_le_buf(block  , (const u8*)"expand 32-byte k", 4); // constant
204
    load32_le_buf(block+4, key                          , 8); // key
205
}
206

207
void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
208
{
209
    u32 block[16];
210
    chacha20_init_key(block, key);
211
    // input
212
    load32_le_buf(block + 12, in, 4);
213
    chacha20_rounds(block, block);
214
    // prevent reversal of the rounds by revealing only half of the buffer.
215
    store32_le_buf(out   , block   , 4); // constant
216
    store32_le_buf(out+16, block+12, 4); // counter and nonce
217
    WIPE_BUFFER(block);
218
}
219

220
u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
221
                        size_t text_size, const u8 key[32], const u8 nonce[8],
222
                        u64 ctr)
223
{
224
    u32 input[16];
225
    chacha20_init_key(input, key);
226
    input[12] = (u32) ctr;
227
    input[13] = (u32)(ctr >> 32);
228
    load32_le_buf(input+14, nonce, 2);
229

230
    // Whole blocks
231
    u32    pool[16];
232
    size_t nb_blocks = text_size >> 6;
233
    FOR (i, 0, nb_blocks) {
234
        chacha20_rounds(pool, input);
235
        if (plain_text != 0) {
236
            FOR (j, 0, 16) {
237
                u32 p = pool[j] + input[j];
238
                store32_le(cipher_text, p ^ load32_le(plain_text));
239
                cipher_text += 4;
240
                plain_text  += 4;
241
            }
242
        } else {
243
            FOR (j, 0, 16) {
244
                u32 p = pool[j] + input[j];
245
                store32_le(cipher_text, p);
246
                cipher_text += 4;
247
            }
248
        }
249
        input[12]++;
250
        if (input[12] == 0) {
251
            input[13]++;
252
        }
253
    }
254
    text_size &= 63;
255

256
    // Last (incomplete) block
257
    if (text_size > 0) {
258
        if (plain_text == 0) {
259
            plain_text = zero;
260
        }
261
        chacha20_rounds(pool, input);
262
        u8 tmp[64];
263
        FOR (i, 0, 16) {
264
            store32_le(tmp + i*4, pool[i] + input[i]);
265
        }
266
        FOR (i, 0, text_size) {
267
            cipher_text[i] = tmp[i] ^ plain_text[i];
268
        }
269
        WIPE_BUFFER(tmp);
270
    }
271
    ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
272

273
    WIPE_BUFFER(pool);
274
    WIPE_BUFFER(input);
275
    return ctr;
276
}
277

278
u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
279
                             size_t text_size,
280
                             const u8 key[32], const u8 nonce[12], u32 ctr)
281
{
282
    u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
283
    return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
284
                                    key, nonce + 4, big_ctr);
285
}
286

287
u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
288
                         size_t text_size,
289
                         const u8 key[32], const u8 nonce[24], u64 ctr)
290
{
291
    u8 sub_key[32];
292
    crypto_hchacha20(sub_key, key, nonce);
293
    ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
294
                              sub_key, nonce+16, ctr);
295
    WIPE_BUFFER(sub_key);
296
    return ctr;
297
}
298

299
void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
300
                     const u8 key[32], const u8 nonce[8])
301
{
302
    crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
303

304
}
305
void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
306
                          size_t text_size,
307
                          const u8 key[32], const u8 nonce[12])
308
{
309
    crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
310
}
311

312
void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
313
                      const u8 key[32], const u8 nonce[24])
314
{
315
    crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
316
}
317

318
/////////////////
319
/// Poly 1305 ///
320
/////////////////
321

322
// h = (h + c) * r
323
// preconditions:
324
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
325
//   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
326
//   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
327
// Postcondition:
328
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
329
static void poly_block(crypto_poly1305_ctx *ctx)
330
{
331
    // s = h + c, without carry propagation
332
    const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
333
    const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
334
    const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
335
    const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
336
    const u32 s4 = ctx->h[4] +      ctx->c[4]; // s4 <=          5
337

338
    // Local all the things!
339
    const u32 r0 = ctx->r[0];       // r0  <= 0fffffff
340
    const u32 r1 = ctx->r[1];       // r1  <= 0ffffffc
341
    const u32 r2 = ctx->r[2];       // r2  <= 0ffffffc
342
    const u32 r3 = ctx->r[3];       // r3  <= 0ffffffc
343
    const u32 rr0 = (r0 >> 2) * 5;  // rr0 <= 13fffffb // lose 2 bits...
344
    const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
345
    const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr1 == (r2 >> 2) * 5
346
    const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr1 == (r3 >> 2) * 5
347

348
    // (h + c) * r, without carry propagation
349
    const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
350
    const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
351
    const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
352
    const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
353
    const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <=                f
354

355
    // partial reduction modulo 2^130 - 5
356
    const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
357
    const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
358
    const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
359
    const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
360
    const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
361
    const u64 u4 = (u3 >> 32)     + (u5 & 3);
362

363
    // Update the hash
364
    ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
365
    ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
366
    ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
367
    ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
368
    ctx->h[4] = (u32)u4; // u4 <=          4
369
}
370

371
// (re-)initialises the input counter and input buffer
372
static void poly_clear_c(crypto_poly1305_ctx *ctx)
373
{
374
    ZERO(ctx->c, 4);
375
    ctx->c_idx = 0;
376
}
377

378
static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
379
{
380
    size_t word = ctx->c_idx >> 2;
381
    size_t byte = ctx->c_idx & 3;
382
    ctx->c[word] |= (u32)input << (byte * 8);
383
    ctx->c_idx++;
384
}
385

386
static void poly_update(crypto_poly1305_ctx *ctx,
387
                        const u8 *message, size_t message_size)
388
{
389
    FOR (i, 0, message_size) {
390
        poly_take_input(ctx, message[i]);
391
        if (ctx->c_idx == 16) {
392
            poly_block(ctx);
393
            poly_clear_c(ctx);
394
        }
395
    }
396
}
397

398
void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
399
{
400
    // Initial hash is zero
401
    ZERO(ctx->h, 5);
402
    // add 2^130 to every input block
403
    ctx->c[4] = 1;
404
    poly_clear_c(ctx);
405
    // load r and pad (r has some of its bits cleared)
406
    load32_le_buf(ctx->r  , key   , 4);
407
    load32_le_buf(ctx->pad, key+16, 4);
408
    FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
409
    FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
410
}
411

412
void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
413
                            const u8 *message, size_t message_size)
414
{
415
    if (message_size == 0) {
416
        return;
417
    }
418
    // Align ourselves with block boundaries
419
    size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
420
    poly_update(ctx, message, aligned);
421
    message      += aligned;
422
    message_size -= aligned;
423

424
    // Process the message block by block
425
    size_t nb_blocks = message_size >> 4;
426
    FOR (i, 0, nb_blocks) {
427
        load32_le_buf(ctx->c, message, 4);
428
        poly_block(ctx);
429
        message += 16;
430
    }
431
    if (nb_blocks > 0) {
432
        poly_clear_c(ctx);
433
    }
434
    message_size &= 15;
435

436
    // remaining bytes
437
    poly_update(ctx, message, message_size);
438
}
439

440
void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
441
{
442
    // Process the last block (if any)
443
    if (ctx->c_idx != 0) {
444
        // move the final 1 according to remaining input length
445
        // (We may add less than 2^130 to the last input block)
446
        ctx->c[4] = 0;
447
        poly_take_input(ctx, 1);
448
        // one last hash update
449
        poly_block(ctx);
450
    }
451

452
    // check if we should subtract 2^130-5 by performing the
453
    // corresponding carry propagation.
454
    u64 c = 5;
455
    FOR (i, 0, 4) {
456
        c  += ctx->h[i];
457
        c >>= 32;
458
    }
459
    c += ctx->h[4];
460
    c  = (c >> 2) * 5; // shift the carry back to the beginning
461
    // c now indicates how many times we should subtract 2^130-5 (0 or 1)
462
    FOR (i, 0, 4) {
463
        c += (u64)ctx->h[i] + ctx->pad[i];
464
        store32_le(mac + i*4, (u32)c);
465
        c = c >> 32;
466
    }
467
    WIPE_CTX(ctx);
468
}
469

470
void crypto_poly1305(u8     mac[16],  const u8 *message,
471
                     size_t message_size, const u8  key[32])
472
{
473
    crypto_poly1305_ctx ctx;
474
    crypto_poly1305_init  (&ctx, key);
475
    crypto_poly1305_update(&ctx, message, message_size);
476
    crypto_poly1305_final (&ctx, mac);
477
}
478

479
////////////////
480
/// Blake2 b ///
481
////////////////
482
static const u64 iv[8] = {
483
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
484
    0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
485
    0x510e527fade682d1, 0x9b05688c2b3e6c1f,
486
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
487
};
488

489
// increment the input offset
490
static void blake2b_incr(crypto_blake2b_ctx *ctx)
491
{
492
    u64   *x = ctx->input_offset;
493
    size_t y = ctx->input_idx;
494
    x[0] += y;
495
    if (x[0] < y) {
496
        x[1]++;
497
    }
498
}
499

500
static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
501
{
502
    static const u8 sigma[12][16] = {
503
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
504
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
505
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
506
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
507
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
508
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
509
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
510
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
511
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
512
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
513
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
514
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
515
    };
516

517
    // init work vector
518
    u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
519
    u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
520
    u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
521
    u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
522
    u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
523
    u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
524
    u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
525
    u64 v7 = ctx->hash[7];  u64 v15 = iv[7];
526

527
    // mangle work vector
528
    u64 *input = ctx->input;
529
#define BLAKE2_G(a, b, c, d, x, y)      \
530
    a += b + x;  d = rotr64(d ^ a, 32); \
531
    c += d;      b = rotr64(b ^ c, 24); \
532
    a += b + y;  d = rotr64(d ^ a, 16); \
533
    c += d;      b = rotr64(b ^ c, 63)
534
#define BLAKE2_ROUND(i)                                                 \
535
    BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
536
    BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
537
    BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
538
    BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
539
    BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
540
    BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
541
    BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
542
    BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
543

544
#ifdef BLAKE2_NO_UNROLLING
545
    FOR (i, 0, 12) {
546
        BLAKE2_ROUND(i);
547
    }
548
#else
549
    BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
550
    BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
551
    BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
552
#endif
553

554
    // update hash
555
    ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
556
    ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
557
    ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
558
    ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
559
}
560

561
static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
562
{
563
    if (index == 0) {
564
        ZERO(ctx->input, 16);
565
    }
566
    size_t word = index >> 3;
567
    size_t byte = index & 7;
568
    ctx->input[word] |= (u64)input << (byte << 3);
569

570
}
571

572
static void blake2b_end_block(crypto_blake2b_ctx *ctx)
573
{
574
    if (ctx->input_idx == 128) {  // If buffer is full,
575
        blake2b_incr(ctx);        // update the input offset
576
        blake2b_compress(ctx, 0); // and compress the (not last) block
577
        ctx->input_idx = 0;
578
    }
579
}
580

581
static void blake2b_update(crypto_blake2b_ctx *ctx,
582
                           const u8 *message, size_t message_size)
583
{
584
    FOR (i, 0, message_size) {
585
        blake2b_end_block(ctx);
586
        blake2b_set_input(ctx, message[i], ctx->input_idx);
587
        ctx->input_idx++;
588
    }
589
}
590

591
void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
592
                                 const u8           *key, size_t key_size)
593
{
594
    // initial hash
595
    COPY(ctx->hash, iv, 8);
596
    ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
597

598
    ctx->input_offset[0] = 0;         // beginning of the input, no offset
599
    ctx->input_offset[1] = 0;         // beginning of the input, no offset
600
    ctx->hash_size       = hash_size; // remember the hash size we want
601
    ctx->input_idx       = 0;
602

603
    // if there is a key, the first block is that key (padded with zeroes)
604
    if (key_size > 0) {
605
        u8 key_block[128] = {0};
606
        COPY(key_block, key, key_size);
607
        // same as calling crypto_blake2b_update(ctx, key_block , 128)
608
        load64_le_buf(ctx->input, key_block, 16);
609
        ctx->input_idx = 128;
610
    }
611
}
612

613
void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
614
{
615
    crypto_blake2b_general_init(ctx, 64, 0, 0);
616
}
617

618
void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
619
                           const u8 *message, size_t message_size)
620
{
621
    if (message_size == 0) {
622
        return;
623
    }
624
    // Align ourselves with block boundaries
625
    size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
626
    blake2b_update(ctx, message, aligned);
627
    message      += aligned;
628
    message_size -= aligned;
629

630
    // Process the message block by block
631
    FOR (i, 0, message_size >> 7) { // number of blocks
632
        blake2b_end_block(ctx);
633
        load64_le_buf(ctx->input, message, 16);
634
        message += 128;
635
        ctx->input_idx = 128;
636
    }
637
    message_size &= 127;
638

639
    // remaining bytes
640
    blake2b_update(ctx, message, message_size);
641
}
642

643
void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
644
{
645
    // Pad the end of the block with zeroes
646
    FOR (i, ctx->input_idx, 128) {
647
        blake2b_set_input(ctx, 0, i);
648
    }
649
    blake2b_incr(ctx);        // update the input offset
650
    blake2b_compress(ctx, 1); // compress the last block
651
    size_t nb_words = ctx->hash_size >> 3;
652
    store64_le_buf(hash, ctx->hash, nb_words);
653
    FOR (i, nb_words << 3, ctx->hash_size) {
654
        hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
655
    }
656
    WIPE_CTX(ctx);
657
}
658

659
void crypto_blake2b_general(u8       *hash   , size_t hash_size,
660
                            const u8 *key    , size_t key_size,
661
                            const u8 *message, size_t message_size)
662
{
663
    crypto_blake2b_ctx ctx;
664
    crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
665
    crypto_blake2b_update(&ctx, message, message_size);
666
    crypto_blake2b_final(&ctx, hash);
667
}
668

669
void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
670
{
671
    crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
672
}
673

674
static void blake2b_vtable_init(void *ctx) {
675
    crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
676
}
677
static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
678
    crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
679
}
680
static void blake2b_vtable_final(void *ctx, u8 *h) {
681
    crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
682
}
683
const crypto_sign_vtable crypto_blake2b_vtable = {
684
    crypto_blake2b,
685
    blake2b_vtable_init,
686
    blake2b_vtable_update,
687
    blake2b_vtable_final,
688
    sizeof(crypto_sign_ctx),
689
};
690

691
#if MONOCYPHER_ARGON2_ENABLE
692
////////////////
693
/// Argon2 i ///
694
////////////////
695
// references to R, Z, Q etc. come from the spec
696

697
// Argon2 operates on 1024 byte blocks.
698
typedef struct { u64 a[128]; } block;
699

700
static void wipe_block(block *b)
701
{
702
    volatile u64* a = b->a;
703
    ZERO(a, 128);
704
}
705

706
// updates a Blake2 hash with a 32 bit word, little endian.
707
static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
708
{
709
    u8 buf[4];
710
    store32_le(buf, input);
711
    crypto_blake2b_update(ctx, buf, 4);
712
    WIPE_BUFFER(buf);
713
}
714

715
static void load_block(block *b, const u8 bytes[1024])
716
{
717
    load64_le_buf(b->a, bytes, 128);
718
}
719

720
static void store_block(u8 bytes[1024], const block *b)
721
{
722
    store64_le_buf(bytes, b->a, 128);
723
}
724

725
static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
726
static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
727

728
// Hash with a virtually unlimited digest size.
729
// Doesn't extract more entropy than the base hash function.
730
// Mainly used for filling a whole kilobyte block with pseudo-random bytes.
731
// (One could use a stream cipher with a seed hash as the key, but
732
//  this would introduce another dependency —and point of failure.)
733
static void extended_hash(u8       *digest, u32 digest_size,
734
                          const u8 *input , u32 input_size)
735
{
736
    crypto_blake2b_ctx ctx;
737
    crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
738
    blake_update_32            (&ctx, digest_size);
739
    crypto_blake2b_update      (&ctx, input, input_size);
740
    crypto_blake2b_final       (&ctx, digest);
741

742
    if (digest_size > 64) {
743
        // the conversion to u64 avoids integer overflow on
744
        // ludicrously big hash sizes.
745
        u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
746
        u32 i   =  1;
747
        u32 in  =  0;
748
        u32 out = 32;
749
        while (i < r) {
750
            // Input and output overlap. This is intentional
751
            crypto_blake2b(digest + out, digest + in, 64);
752
            i   +=  1;
753
            in  += 32;
754
            out += 32;
755
        }
756
        crypto_blake2b_general(digest + out, digest_size - (32 * r),
757
                               0, 0, // no key
758
                               digest + in , 64);
759
    }
760
}
761

762
#define LSB(x) ((x) & 0xffffffff)
763
#define G(a, b, c, d)                                            \
764
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
765
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
766
    a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
767
    c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
768
#define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
769
              v8,  v9, v10, v11, v12, v13, v14, v15)    \
770
    G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
771
    G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
772
    G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
773
    G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
774

775
// Core of the compression function G.  Computes Z from R in place.
776
static void g_rounds(block *work_block)
777
{
778
    // column rounds (work_block = Q)
779
    for (int i = 0; i < 128; i += 16) {
780
        ROUND(work_block->a[i     ], work_block->a[i +  1],
781
              work_block->a[i +  2], work_block->a[i +  3],
782
              work_block->a[i +  4], work_block->a[i +  5],
783
              work_block->a[i +  6], work_block->a[i +  7],
784
              work_block->a[i +  8], work_block->a[i +  9],
785
              work_block->a[i + 10], work_block->a[i + 11],
786
              work_block->a[i + 12], work_block->a[i + 13],
787
              work_block->a[i + 14], work_block->a[i + 15]);
788
    }
789
    // row rounds (work_block = Z)
790
    for (int i = 0; i < 16; i += 2) {
791
        ROUND(work_block->a[i      ], work_block->a[i +   1],
792
              work_block->a[i +  16], work_block->a[i +  17],
793
              work_block->a[i +  32], work_block->a[i +  33],
794
              work_block->a[i +  48], work_block->a[i +  49],
795
              work_block->a[i +  64], work_block->a[i +  65],
796
              work_block->a[i +  80], work_block->a[i +  81],
797
              work_block->a[i +  96], work_block->a[i +  97],
798
              work_block->a[i + 112], work_block->a[i + 113]);
799
    }
800
}
801

802
// The compression function G (copy version for the first pass)
803
static void g_copy(block *result, const block *x, const block *y, block* tmp)
804
{
805
    copy_block(tmp   , x  ); // tmp    = X
806
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
807
    copy_block(result, tmp); // result = R         (only difference with g_xor)
808
    g_rounds  (tmp);         // tmp    = Z
809
    xor_block (result, tmp); // result = R ^ Z
810
}
811

812
// The compression function G (xor version for subsequent passes)
813
static void g_xor(block *result, const block *x, const block *y, block *tmp)
814
{
815
    copy_block(tmp   , x  ); // tmp    = X
816
    xor_block (tmp   , y  ); // tmp    = X ^ Y = R
817
    xor_block (result, tmp); // result = R ^ old   (only difference with g_copy)
818
    g_rounds  (tmp);         // tmp    = Z
819
    xor_block (result, tmp); // result = R ^ old ^ Z
820
}
821

822
// Unary version of the compression function.
823
// The missing argument is implied zero.
824
// Does the transformation in place.
825
static void unary_g(block *work_block, block *tmp)
826
{
827
    // work_block == R
828
    copy_block(tmp, work_block); // tmp        = R
829
    g_rounds  (work_block);      // work_block = Z
830
    xor_block (work_block, tmp); // work_block = Z ^ R
831
}
832

833
// Argon2i uses a kind of stream cipher to determine which reference
834
// block it will take to synthesise the next block.  This context hold
835
// that stream's state.  (It's very similar to Chacha20.  The block b
836
// is analogous to Chacha's own pool)
837
typedef struct {
838
    block b;
839
    u32 pass_number;
840
    u32 slice_number;
841
    u32 nb_blocks;
842
    u32 nb_iterations;
843
    u32 ctr;
844
    u32 offset;
845
} gidx_ctx;
846

847
// The block in the context will determine array indices. To avoid
848
// timing attacks, it only depends on public information.  No looking
849
// at a previous block to seed the next.  This makes offline attacks
850
// easier, but timing attacks are the bigger threat in many settings.
851
static void gidx_refresh(gidx_ctx *ctx)
852
{
853
    // seed the beginning of the block...
854
    ctx->b.a[0] = ctx->pass_number;
855
    ctx->b.a[1] = 0;  // lane number (we have only one)
856
    ctx->b.a[2] = ctx->slice_number;
857
    ctx->b.a[3] = ctx->nb_blocks;
858
    ctx->b.a[4] = ctx->nb_iterations;
859
    ctx->b.a[5] = 1;  // type: Argon2i
860
    ctx->b.a[6] = ctx->ctr;
861
    ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
862

863
    // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
864
    // (G "square" function), to get cheap pseudo-random numbers.
865
    block tmp;
866
    unary_g(&ctx->b, &tmp);
867
    unary_g(&ctx->b, &tmp);
868
    wipe_block(&tmp);
869
}
870

871
static void gidx_init(gidx_ctx *ctx,
872
                      u32 pass_number, u32 slice_number,
873
                      u32 nb_blocks,   u32 nb_iterations)
874
{
875
    ctx->pass_number   = pass_number;
876
    ctx->slice_number  = slice_number;
877
    ctx->nb_blocks     = nb_blocks;
878
    ctx->nb_iterations = nb_iterations;
879
    ctx->ctr           = 0;
880

881
    // Offset from the beginning of the segment.  For the first slice
882
    // of the first pass, we start at the *third* block, so the offset
883
    // starts at 2, not 0.
884
    if (pass_number != 0 || slice_number != 0) {
885
        ctx->offset = 0;
886
    } else {
887
        ctx->offset = 2;
888
        ctx->ctr++;         // Compensates for missed lazy creation
889
        gidx_refresh(ctx);  // at the start of gidx_next()
890
    }
891
}
892

893
static u32 gidx_next(gidx_ctx *ctx)
894
{
895
    // lazily creates the offset block we need
896
    if ((ctx->offset & 127) == 0) {
897
        ctx->ctr++;
898
        gidx_refresh(ctx);
899
    }
900
    u32 index  = ctx->offset & 127; // save index  for current call
901
    u32 offset = ctx->offset;       // save offset for current call
902
    ctx->offset++;                  // update offset for next call
903

904
    // Computes the area size.
905
    // Pass 0 : all already finished segments plus already constructed
906
    //          blocks in this segment
907
    // Pass 1+: 3 last segments plus already constructed
908
    //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
909
    //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
910
    int first_pass  = ctx->pass_number == 0;
911
    u32 slice_size  = ctx->nb_blocks >> 2;
912
    u32 nb_segments = first_pass ? ctx->slice_number : 3;
913
    u32 area_size   = nb_segments * slice_size + offset - 1;
914

915
    // Computes the starting position of the reference area.
916
    // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
917
    // NEXT SEGMENT, NOT THE NEXT BLOCK.
918
    u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
919
    u32 start_pos  = first_pass ? 0 : next_slice;
920

921
    // Generate offset from J1 (no need for J2, there's only one lane)
922
    u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
923
    u64 x   = (j1 * j1)       >> 32;
924
    u64 y   = (area_size * x) >> 32;
925
    u64 z   = (area_size - 1) - y;
926
    u64 ref = start_pos + z;                // ref < 2 * nb_blocks
927
    return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
928
}
929

930
// Main algorithm
931
void crypto_argon2i_general(u8       *hash,      u32 hash_size,
932
                            void     *work_area, u32 nb_blocks,
933
                            u32 nb_iterations,
934
                            const u8 *password,  u32 password_size,
935
                            const u8 *salt,      u32 salt_size,
936
                            const u8 *key,       u32 key_size,
937
                            const u8 *ad,        u32 ad_size)
938
{
939
    // work area seen as blocks (must be suitably aligned)
940
    block *blocks = (block*)work_area;
941
    {
942
        crypto_blake2b_ctx ctx;
943
        crypto_blake2b_init(&ctx);
944

945
        blake_update_32      (&ctx, 1            ); // p: number of threads
946
        blake_update_32      (&ctx, hash_size    );
947
        blake_update_32      (&ctx, nb_blocks    );
948
        blake_update_32      (&ctx, nb_iterations);
949
        blake_update_32      (&ctx, 0x13         ); // v: version number
950
        blake_update_32      (&ctx, 1            ); // y: Argon2i
951
        blake_update_32      (&ctx,           password_size);
952
        crypto_blake2b_update(&ctx, password, password_size);
953
        blake_update_32      (&ctx,           salt_size);
954
        crypto_blake2b_update(&ctx, salt,     salt_size);
955
        blake_update_32      (&ctx,           key_size);
956
        crypto_blake2b_update(&ctx, key,      key_size);
957
        blake_update_32      (&ctx,           ad_size);
958
        crypto_blake2b_update(&ctx, ad,       ad_size);
959

960
        u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
961
        crypto_blake2b_final(&ctx, initial_hash);
962

963
        // fill first 2 blocks
964
        block tmp_block;
965
        u8    hash_area[1024];
966
        store32_le(initial_hash + 64, 0); // first  additional word
967
        store32_le(initial_hash + 68, 0); // second additional word
968
        extended_hash(hash_area, 1024, initial_hash, 72);
969
        load_block(&tmp_block, hash_area);
970
        copy_block(blocks, &tmp_block);
971

972
        store32_le(initial_hash + 64, 1); // slight modification
973
        extended_hash(hash_area, 1024, initial_hash, 72);
974
        load_block(&tmp_block, hash_area);
975
        copy_block(blocks + 1, &tmp_block);
976

977
        WIPE_BUFFER(initial_hash);
978
        WIPE_BUFFER(hash_area);
979
        wipe_block(&tmp_block);
980
    }
981

982
    // Actual number of blocks
983
    nb_blocks -= nb_blocks & 3; // round down to 4 p (p == 1 thread)
984
    const u32 segment_size = nb_blocks >> 2;
985

986
    // fill (then re-fill) the rest of the blocks
987
    block tmp;
988
    gidx_ctx ctx; // public information, no need to wipe
989
    FOR_T (u32, pass_number, 0, nb_iterations) {
990
        int first_pass = pass_number == 0;
991

992
        FOR_T (u32, segment, 0, 4) {
993
            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
994

995
            // On the first segment of the first pass,
996
            // blocks 0 and 1 are already filled.
997
            // We use the offset to skip them.
998
            u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
999
            u32 segment_start = segment * segment_size + start_offset;
1000
            u32 segment_end   = (segment + 1) * segment_size;
1001
            FOR_T (u32, current_block, segment_start, segment_end) {
1002
                u32 reference_block = gidx_next(&ctx);
1003
                u32 previous_block  = current_block == 0
1004
                                    ? nb_blocks - 1
1005
                                    : current_block - 1;
1006
                block *c = blocks + current_block;
1007
                block *p = blocks + previous_block;
1008
                block *r = blocks + reference_block;
1009
                if (first_pass) { g_copy(c, p, r, &tmp); }
1010
                else            { g_xor (c, p, r, &tmp); }
1011
            }
1012
        }
1013
    }
1014
    wipe_block(&tmp);
1015
    u8 final_block[1024];
1016
    store_block(final_block, blocks + (nb_blocks - 1));
1017

1018
    // wipe work area
1019
    volatile u64 *p = (u64*)work_area;
1020
    ZERO(p, 128 * nb_blocks);
1021

1022
    // hash the very last block with H' into the output hash
1023
    extended_hash(hash, hash_size, final_block, 1024);
1024
    WIPE_BUFFER(final_block);
1025
}
1026

1027
void crypto_argon2i(u8   *hash,      u32 hash_size,
1028
                    void *work_area, u32 nb_blocks, u32 nb_iterations,
1029
                    const u8 *password,  u32 password_size,
1030
                    const u8 *salt,      u32 salt_size)
1031
{
1032
    crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
1033
                           password, password_size, salt , salt_size, 0,0,0,0);
1034
}
1035

1036
#endif // MONOCYPHER_ARGON2_ENABLE
1037

1038
////////////////////////////////////
1039
/// Arithmetic modulo 2^255 - 19 ///
1040
////////////////////////////////////
1041
//  Originally taken from SUPERCOP's ref10 implementation.
1042
//  A bit bigger than TweetNaCl, over 4 times faster.
1043

1044
// field element
1045
typedef i32 fe[10];
1046

1047
// field constants
1048
//
1049
// fe_one      : 1
1050
// sqrtm1      : sqrt(-1)
1051
// d           :     -121665 / 121666
1052
// D2          : 2 * -121665 / 121666
1053
// lop_x, lop_y: low order point in Edwards coordinates
1054
// ufactor     : -sqrt(-1) * 2
1055
// A2          : 486662^2  (A squared)
1056
static const fe fe_one  = {1};
1057
static const fe sqrtm1  = {-32595792, -7943725, 9377950, 3500415, 12389472,
1058
                           -272473, -25146209, -2005654, 326686, 11406482,};
1059
static const fe d       = {-10913610, 13857413, -15372611, 6949391, 114729,
1060
                           -8787816, -6275908, -3247719, -18696448, -12055116,};
1061
static const fe D2      = {-21827239, -5839606, -30745221, 13898782, 229458,
1062
                           15978800, -12551817, -6495438, 29715968, 9444199,};
1063
static const fe lop_x   = {21352778, 5345713, 4660180, -8347857, 24143090,
1064
                           14568123, 30185756, -12247770, -33528939, 8345319,};
1065
static const fe lop_y   = {-6952922, -1265500, 6862341, -7057498, -4037696,
1066
                           -5447722, 31680899, -15325402, -19365852, 1569102,};
1067
static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
1068
                           544946, -16816446, 4011309, -653372, 10741468,};
1069
static const fe A2      = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};
1070

1071
static void fe_0(fe h) {           ZERO(h  , 10); }
1072
static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }
1073

1074
static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
1075
static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
1076
static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
1077
static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
1078

1079
static void fe_cswap(fe f, fe g, int b)
1080
{
1081
    i32 mask = -b; // -1 = 0xffffffff
1082
    FOR (i, 0, 10) {
1083
        i32 x = (f[i] ^ g[i]) & mask;
1084
        f[i] = f[i] ^ x;
1085
        g[i] = g[i] ^ x;
1086
    }
1087
}
1088

1089
static void fe_ccopy(fe f, const fe g, int b)
1090
{
1091
    i32 mask = -b; // -1 = 0xffffffff
1092
    FOR (i, 0, 10) {
1093
        i32 x = (f[i] ^ g[i]) & mask;
1094
        f[i] = f[i] ^ x;
1095
    }
1096
}
1097

1098

1099
// Signed carry propagation
1100
// ------------------------
1101
//
1102
// Let t be a number.  It can be uniquely decomposed thus:
1103
//
1104
//    t = h*2^26 + l
1105
//    such that -2^25 <= l < 2^25
1106
//
1107
// Let c = (t + 2^25) / 2^26            (rounded down)
1108
//     c = (h*2^26 + l + 2^25) / 2^26   (rounded down)
1109
//     c =  h   +   (l + 2^25) / 2^26   (rounded down)
1110
//     c =  h                           (exactly)
1111
// Because 0 <= l + 2^25 < 2^26
1112
//
1113
// Let u = t          - c*2^26
1114
//     u = h*2^26 + l - h*2^26
1115
//     u = l
1116
// Therefore, -2^25 <= u < 2^25
1117
//
1118
// Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
1119
//
1120
// Notations:
1121
// - In C, 1<<25 means 2^25.
1122
// - In C, x>>25 means floor(x / (2^25)).
1123
// - All of the above applies with 25 & 24 as well as 26 & 25.
1124
//
1125
//
1126
// Note on negative right shifts
1127
// -----------------------------
1128
//
1129
// In C, x >> n, where x is a negative integer, is implementation
1130
// defined.  In practice, all platforms do arithmetic shift, which is
1131
// equivalent to division by 2^26, rounded down.  Some compilers, like
1132
// GCC, even guarantee it.
1133
//
1134
// If we ever stumble upon a platform that does not propagate the sign
1135
// bit (we won't), visible failures will show at the slightest test, and
1136
// the signed shifts can be replaced by the following:
1137
//
1138
//     typedef struct { i64 x:39; } s25;
1139
//     typedef struct { i64 x:38; } s26;
1140
//     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
1141
//     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
1142
//
1143
// Current compilers cannot optimise this, causing a 30% drop in
1144
// performance.  Fairly expensive for something that never happens.
1145
//
1146
//
1147
// Precondition
1148
// ------------
1149
//
1150
// |t0|       < 2^63
1151
// |t1|..|t9| < 2^62
1152
//
1153
// Algorithm
1154
// ---------
1155
// c   = t0 + 2^25 / 2^26   -- |c|  <= 2^36
1156
// t0 -= c * 2^26           -- |t0| <= 2^25
1157
// t1 += c                  -- |t1| <= 2^63
1158
//
1159
// c   = t4 + 2^25 / 2^26   -- |c|  <= 2^36
1160
// t4 -= c * 2^26           -- |t4| <= 2^25
1161
// t5 += c                  -- |t5| <= 2^63
1162
//
1163
// c   = t1 + 2^24 / 2^25   -- |c|  <= 2^38
1164
// t1 -= c * 2^25           -- |t1| <= 2^24
1165
// t2 += c                  -- |t2| <= 2^63
1166
//
1167
// c   = t5 + 2^24 / 2^25   -- |c|  <= 2^38
1168
// t5 -= c * 2^25           -- |t5| <= 2^24
1169
// t6 += c                  -- |t6| <= 2^63
1170
//
1171
// c   = t2 + 2^25 / 2^26   -- |c|  <= 2^37
1172
// t2 -= c * 2^26           -- |t2| <= 2^25        < 1.1 * 2^25  (final t2)
1173
// t3 += c                  -- |t3| <= 2^63
1174
//
1175
// c   = t6 + 2^25 / 2^26   -- |c|  <= 2^37
1176
// t6 -= c * 2^26           -- |t6| <= 2^25        < 1.1 * 2^25  (final t6)
1177
// t7 += c                  -- |t7| <= 2^63
1178
//
1179
// c   = t3 + 2^24 / 2^25   -- |c|  <= 2^38
1180
// t3 -= c * 2^25           -- |t3| <= 2^24        < 1.1 * 2^24  (final t3)
1181
// t4 += c                  -- |t4| <= 2^25 + 2^38 < 2^39
1182
//
1183
// c   = t7 + 2^24 / 2^25   -- |c|  <= 2^38
1184
// t7 -= c * 2^25           -- |t7| <= 2^24        < 1.1 * 2^24  (final t7)
1185
// t8 += c                  -- |t8| <= 2^63
1186
//
1187
// c   = t4 + 2^25 / 2^26   -- |c|  <= 2^13
1188
// t4 -= c * 2^26           -- |t4| <= 2^25        < 1.1 * 2^25  (final t4)
1189
// t5 += c                  -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24  (final t5)
1190
//
1191
// c   = t8 + 2^25 / 2^26   -- |c|  <= 2^37
1192
// t8 -= c * 2^26           -- |t8| <= 2^25        < 1.1 * 2^25  (final t8)
1193
// t9 += c                  -- |t9| <= 2^63
1194
//
1195
// c   = t9 + 2^24 / 2^25   -- |c|  <= 2^38
1196
// t9 -= c * 2^25           -- |t9| <= 2^24        < 1.1 * 2^24  (final t9)
1197
// t0 += c * 19             -- |t0| <= 2^25 + 2^38*19 < 2^44
1198
//
1199
// c   = t0 + 2^25 / 2^26   -- |c|  <= 2^18
1200
// t0 -= c * 2^26           -- |t0| <= 2^25        < 1.1 * 2^25  (final t0)
1201
// t1 += c                  -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24  (final t1)
1202
//
1203
// Postcondition
1204
// -------------
1205
//   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
1206
//   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
1207
#define FE_CARRY                                                        \
1208
    i64 c;                                                              \
1209
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
1210
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
1211
    c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
1212
    c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
1213
    c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
1214
    c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
1215
    c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
1216
    c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
1217
    c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
1218
    c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
1219
    c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
1220
    c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
1221
    h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
1222
    h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9
1223

1224
static void fe_frombytes(fe h, const u8 s[32])
1225
{
1226
    i64 t0 =  load32_le(s);                        // t0 < 2^32
1227
    i64 t1 =  load24_le(s +  4) << 6;              // t1 < 2^30
1228
    i64 t2 =  load24_le(s +  7) << 5;              // t2 < 2^29
1229
    i64 t3 =  load24_le(s + 10) << 3;              // t3 < 2^27
1230
    i64 t4 =  load24_le(s + 13) << 2;              // t4 < 2^26
1231
    i64 t5 =  load32_le(s + 16);                   // t5 < 2^32
1232
    i64 t6 =  load24_le(s + 20) << 7;              // t6 < 2^31
1233
    i64 t7 =  load24_le(s + 23) << 5;              // t7 < 2^29
1234
    i64 t8 =  load24_le(s + 26) << 4;              // t8 < 2^28
1235
    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;  // t9 < 2^25
1236
    FE_CARRY;                                      // Carry recondition OK
1237
}
1238

1239
// Precondition
1240
//   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
1241
//   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
1242
//
1243
// Therefore, |h| < 2^255-19
1244
// There are two possibilities:
1245
//
1246
// - If h is positive, all we need to do is reduce its individual
1247
//   limbs down to their tight positive range.
1248
// - If h is negative, we also need to add 2^255-19 to it.
1249
//   Or just remove 19 and chop off any excess bit.
1250
static void fe_tobytes(u8 s[32], const fe h)
1251
{
1252
    i32 t[10];
1253
    COPY(t, h, 10);
1254
    i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
1255
    //                 |t9|                    < 1.1 * 2^24
1256
    //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
1257
    //  -21  * 2^24  <  19 * t9                < 21  * 2^24
1258
    //  -2^29        <  19 * t9 + 2^24         < 2^29
1259
    //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
1260
    //  -16          < (19 * t9 + 2^24) / 2^25 < 16
1261
    FOR (i, 0, 5) {
1262
        q += t[2*i  ]; q >>= 26; // q = 0 or -1
1263
        q += t[2*i+1]; q >>= 25; // q = 0 or -1
1264
    }
1265
    // q =  0 iff h >= 0
1266
    // q = -1 iff h <  0
1267
    // Adding q * 19 to h reduces h to its proper range.
1268
    q *= 19;  // Shift carry back to the beginning
1269
    FOR (i, 0, 5) {
1270
        t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
1271
        t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
1272
    }
1273
    // h is now fully reduced, and q represents the excess bit.
1274

1275
    store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
1276
    store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
1277
    store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
1278
    store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
1279
    store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
1280
    store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
1281
    store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
1282
    store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));
1283

1284
    WIPE_BUFFER(t);
1285
}
1286

1287
// Precondition
1288
// -------------
1289
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1290
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1291
//
1292
//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
1293
//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
1294
static void fe_mul_small(fe h, const fe f, i32 g)
1295
{
1296
    i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
1297
    i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
1298
    i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
1299
    i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
1300
    i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
1301
    // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  < 2^58
1302
    // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  < 2^57
1303

1304
    FE_CARRY; // Carry precondition OK
1305
}
1306

1307
// Precondition
1308
// -------------
1309
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1310
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1311
//
1312
//   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
1313
//   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
1314
static void fe_mul(fe h, const fe f, const fe g)
1315
{
1316
    // Everything is unrolled and put in temporary variables.
1317
    // We could roll the loop, but that would make curve25519 twice as slow.
1318
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1319
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1320
    i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
1321
    i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
1322
    i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
1323
    i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
1324
    i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
1325
    i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
1326
    // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
1327
    // |G0|, |G2|, |G4|, |G6|, |G8|  <  2^31
1328
    // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
1329

1330
    i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
1331
        +    F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
1332
    i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
1333
        +    f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
1334
    i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
1335
        +    F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
1336
    i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
1337
        +    f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
1338
    i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
1339
        +    F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
1340
    i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
1341
        +    f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
1342
    i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
1343
        +    F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
1344
    i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
1345
        +    f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
1346
    i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
1347
        +    F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
1348
    i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
1349
        +    f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
1350
    // t0 < 0.67 * 2^61
1351
    // t1 < 0.41 * 2^61
1352
    // t2 < 0.52 * 2^61
1353
    // t3 < 0.32 * 2^61
1354
    // t4 < 0.38 * 2^61
1355
    // t5 < 0.22 * 2^61
1356
    // t6 < 0.23 * 2^61
1357
    // t7 < 0.13 * 2^61
1358
    // t8 < 0.09 * 2^61
1359
    // t9 < 0.03 * 2^61
1360

1361
    FE_CARRY; // Everything below 2^62, Carry precondition OK
1362
}
1363

1364
// Precondition
1365
// -------------
1366
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1367
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1368
//
1369
// Note: we could use fe_mul() for this, but this is significantly faster
1370
static void fe_sq(fe h, const fe f)
1371
{
1372
    i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1373
    i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1374
    i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;   i32 f3_2 = f3*2;
1375
    i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;   i32 f7_2 = f7*2;
1376
    i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
1377
    i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
1378
    // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  <  1.65 * 2^27
1379
    // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  <  1.65 * 2^26
1380
    // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| <  2^31
1381

1382
    i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
1383
        +    f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
1384
    i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
1385
        +    f4  *(i64)f7_38 + f5_2*(i64)f6_19;
1386
    i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
1387
        +    f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
1388
    i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
1389
        +    f5_2*(i64)f8_19 + f6  *(i64)f7_38;
1390
    i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
1391
        +    f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
1392
    i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
1393
        +    f6  *(i64)f9_38 + f7_2*(i64)f8_19;
1394
    i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
1395
        +    f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
1396
    i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
1397
        +    f3_2*(i64)f4    + f8  *(i64)f9_38;
1398
    i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
1399
        +    f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
1400
    i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
1401
        +    f3_2*(i64)f6    + f4  *(i64)f5_2;
1402
    // t0 < 0.67 * 2^61
1403
    // t1 < 0.41 * 2^61
1404
    // t2 < 0.52 * 2^61
1405
    // t3 < 0.32 * 2^61
1406
    // t4 < 0.38 * 2^61
1407
    // t5 < 0.22 * 2^61
1408
    // t6 < 0.23 * 2^61
1409
    // t7 < 0.13 * 2^61
1410
    // t8 < 0.09 * 2^61
1411
    // t9 < 0.03 * 2^61
1412

1413
    FE_CARRY;
1414
}
1415

1416
// h = 2 * (f^2)
1417
//
1418
// Precondition
1419
// -------------
1420
//   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1421
//   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1422
//
1423
// Note: we could implement fe_sq2() by copying fe_sq(), multiplying
1424
// each limb by 2, *then* perform the carry.  This saves one carry.
1425
// However, doing so with the stated preconditions does not work (t2
1426
// would overflow).  There are 3 ways to solve this:
1427
//
1428
// 1. Show that t2 actually never overflows (it really does not).
1429
// 2. Accept an additional carry, at a small lost of performance.
1430
// 3. Make sure the input of fe_sq2() is freshly carried.
1431
//
1432
// SUPERCOP ref10 relies on (1).
1433
// Monocypher chose (2) and (3), mostly to save code.
1434
static void fe_sq2(fe h, const fe f)
1435
{
1436
    fe_sq(h, f);
1437
    fe_mul_small(h, h, 2);
1438
}
1439

1440
// This could be simplified, but it would be slower
1441
static void fe_pow22523(fe out, const fe z)
1442
{
1443
    fe t0, t1, t2;
1444
    fe_sq(t0, z);
1445
    fe_sq(t1,t0);                   fe_sq(t1, t1);  fe_mul(t1, z, t1);
1446
    fe_mul(t0, t0, t1);
1447
    fe_sq(t0, t0);                                  fe_mul(t0, t1, t0);
1448
    fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1449
    fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
1450
    fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
1451
    fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1452
    fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
1453
    fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
1454
    fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1455
    fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
1456
    WIPE_BUFFER(t0);
1457
    WIPE_BUFFER(t1);
1458
    WIPE_BUFFER(t2);
1459
}
1460

1461
// Inverting means multiplying by 2^255 - 21
1462
// 2^255 - 21 = (2^252 - 3) * 8 + 3
1463
// So we reuse the multiplication chain of fe_pow22523
1464
static void fe_invert(fe out, const fe z)
1465
{
1466
    fe tmp;
1467
    fe_pow22523(tmp, z);
1468
    // tmp2^8 * z^3
1469
    fe_sq(tmp, tmp);                        // 0
1470
    fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
1471
    fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
1472
    WIPE_BUFFER(tmp);
1473
}
1474

1475
//  Parity check.  Returns 0 if even, 1 if odd
1476
static int fe_isodd(const fe f)
1477
{
1478
    u8 s[32];
1479
    fe_tobytes(s, f);
1480
    u8 isodd = s[0] & 1;
1481
    WIPE_BUFFER(s);
1482
    return isodd;
1483
}
1484

1485
// Returns 1 if equal, 0 if not equal
1486
static int fe_isequal(const fe f, const fe g)
1487
{
1488
    u8 fs[32];
1489
    u8 gs[32];
1490
    fe_tobytes(fs, f);
1491
    fe_tobytes(gs, g);
1492
    int isdifferent = crypto_verify32(fs, gs);
1493
    WIPE_BUFFER(fs);
1494
    WIPE_BUFFER(gs);
1495
    return 1 + isdifferent;
1496
}
1497

1498
// Inverse square root.
1499
// Returns true if x is a non zero square, false otherwise.
1500
// After the call:
1501
//   isr = sqrt(1/x)        if x is non-zero square.
1502
//   isr = sqrt(sqrt(-1)/x) if x is not a square.
1503
//   isr = 0                if x is zero.
1504
// We do not guarantee the sign of the square root.
1505
//
1506
// Notes:
1507
// Let quartic = x^((p-1)/4)
1508
//
1509
// x^((p-1)/2) = chi(x)
1510
// quartic^2   = chi(x)
1511
// quartic     = sqrt(chi(x))
1512
// quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
1513
//
1514
// Note that x is a square if quartic is 1 or -1
1515
// There are 4 cases to consider:
1516
//
1517
// if   quartic         = 1  (x is a square)
1518
// then x^((p-1)/4)     = 1
1519
//      x^((p-5)/4) * x = 1
1520
//      x^((p-5)/4)     = 1/x
1521
//      x^((p-5)/8)     = sqrt(1/x) or -sqrt(1/x)
1522
//
1523
// if   quartic                = -1  (x is a square)
1524
// then x^((p-1)/4)            = -1
1525
//      x^((p-5)/4) * x        = -1
1526
//      x^((p-5)/4)            = -1/x
1527
//      x^((p-5)/8)            = sqrt(-1)   / sqrt(x)
1528
//      x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
1529
//      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
1530
//      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
1531
//
1532
// if   quartic         = sqrt(-1)  (x is not a square)
1533
// then x^((p-1)/4)     = sqrt(-1)
1534
//      x^((p-5)/4) * x = sqrt(-1)
1535
//      x^((p-5)/4)     = sqrt(-1)/x
1536
//      x^((p-5)/8)     = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
1537
//
1538
// Note that the product of two non-squares is always a square:
1539
//   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
1540
//   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
1541
//   Therefore a*b is a square.
1542
//
1543
//   Since sqrt(-1) and x are both non-squares, their product is a
1544
//   square, and we can compute their square root.
1545
//
1546
// if   quartic                = -sqrt(-1)  (x is not a square)
1547
// then x^((p-1)/4)            = -sqrt(-1)
1548
//      x^((p-5)/4) * x        = -sqrt(-1)
1549
//      x^((p-5)/4)            = -sqrt(-1)/x
1550
//      x^((p-5)/8)            = sqrt(-sqrt(-1)/x)
1551
//      x^((p-5)/8)            = sqrt( sqrt(-1)/x) * sqrt(-1)
1552
//      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
1553
//      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
1554
//      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
1555
static int invsqrt(fe isr, const fe x)
1556
{
1557
    fe check, quartic;
1558
    fe_copy(check, x);
1559
    fe_pow22523(isr, check);
1560
    fe_sq (quartic, isr);
1561
    fe_mul(quartic, quartic, check);
1562
    fe_1  (check);          int p1 = fe_isequal(quartic, check);
1563
    fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
1564
    fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
1565
    fe_mul(check, isr, sqrtm1);
1566
    fe_ccopy(isr, check, m1 | ms);
1567
    WIPE_BUFFER(quartic);
1568
    WIPE_BUFFER(check);
1569
    return p1 | m1;
1570
}
1571

1572
// trim a scalar for scalar multiplication
1573
static void trim_scalar(u8 scalar[32])
1574
{
1575
    scalar[ 0] &= 248;
1576
    scalar[31] &= 127;
1577
    scalar[31] |= 64;
1578
}
1579

1580
// get bit from scalar at position i
1581
static int scalar_bit(const u8 s[32], int i)
1582
{
1583
    if (i < 0) { return 0; } // handle -1 for sliding windows
1584
    return (s[i>>3] >> (i&7)) & 1;
1585
}
1586

1587
///////////////
1588
/// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1589
///////////////
1590
static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
1591
                       int nb_bits)
1592
{
1593
    // computes the scalar product
1594
    fe x1;
1595
    fe_frombytes(x1, p);
1596

1597
    // computes the actual scalar product (the result is in x2 and z2)
1598
    fe x2, z2, x3, z3, t0, t1;
1599
    // Montgomery ladder
1600
    // In projective coordinates, to avoid divisions: x = X / Z
1601
    // We don't care about the y coordinate, it's only 1 bit of information
1602
    fe_1(x2);        fe_0(z2); // "zero" point
1603
    fe_copy(x3, x1); fe_1(z3); // "one"  point
1604
    int swap = 0;
1605
    for (int pos = nb_bits-1; pos >= 0; --pos) {
1606
        // constant time conditional swap before ladder step
1607
        int b = scalar_bit(scalar, pos);
1608
        swap ^= b; // xor trick avoids swapping at the end of the loop
1609
        fe_cswap(x2, x3, swap);
1610
        fe_cswap(z2, z3, swap);
1611
        swap = b;  // anticipates one last swap after the loop
1612

1613
        // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1614
        // with differential addition
1615
        fe_sub(t0, x3, z3);
1616
        fe_sub(t1, x2, z2);
1617
        fe_add(x2, x2, z2);
1618
        fe_add(z2, x3, z3);
1619
        fe_mul(z3, t0, x2);
1620
        fe_mul(z2, z2, t1);
1621
        fe_sq (t0, t1    );
1622
        fe_sq (t1, x2    );
1623
        fe_add(x3, z3, z2);
1624
        fe_sub(z2, z3, z2);
1625
        fe_mul(x2, t1, t0);
1626
        fe_sub(t1, t1, t0);
1627
        fe_sq (z2, z2    );
1628
        fe_mul_small(z3, t1, 121666);
1629
        fe_sq (x3, x3    );
1630
        fe_add(t0, t0, z3);
1631
        fe_mul(z3, x1, z2);
1632
        fe_mul(z2, t1, t0);
1633
    }
1634
    // last swap is necessary to compensate for the xor trick
1635
    // Note: after this swap, P3 == P2 + P1.
1636
    fe_cswap(x2, x3, swap);
1637
    fe_cswap(z2, z3, swap);
1638

1639
    // normalises the coordinates: x == X / Z
1640
    fe_invert(z2, z2);
1641
    fe_mul(x2, x2, z2);
1642
    fe_tobytes(q, x2);
1643

1644
    WIPE_BUFFER(x1);
1645
    WIPE_BUFFER(x2);  WIPE_BUFFER(z2);  WIPE_BUFFER(t0);
1646
    WIPE_BUFFER(x3);  WIPE_BUFFER(z3);  WIPE_BUFFER(t1);
1647
}
1648

1649
void crypto_x25519(u8       raw_shared_secret[32],
1650
                   const u8 your_secret_key  [32],
1651
                   const u8 their_public_key [32])
1652
{
1653
    // restrict the possible scalar values
1654
    u8 e[32];
1655
    COPY(e, your_secret_key, 32);
1656
    trim_scalar(e);
1657
    scalarmult(raw_shared_secret, e, their_public_key, 255);
1658
    WIPE_BUFFER(e);
1659
}
1660

1661
void crypto_x25519_public_key(u8       public_key[32],
1662
                              const u8 secret_key[32])
1663
{
1664
    static const u8 base_point[32] = {9};
1665
    crypto_x25519(public_key, secret_key, base_point);
1666
}
1667

1668
///////////////////////////
1669
/// Arithmetic modulo L ///
1670
///////////////////////////
1671
static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1672
                         0x00000000, 0x00000000, 0x00000000, 0x10000000,};
1673

1674
//  p = a*b + p
1675
static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1676
{
1677
    FOR (i, 0, 8) {
1678
        u64 carry = 0;
1679
        FOR (j, 0, 8) {
1680
            carry  += p[i+j] + (u64)a[i] * b[j];
1681
            p[i+j]  = (u32)carry;
1682
            carry >>= 32;
1683
        }
1684
        p[i+8] = (u32)carry;
1685
    }
1686
}
1687

1688
static int is_above_l(const u32 x[8])
1689
{
1690
    // We work with L directly, in a 2's complement encoding
1691
    // (-L == ~L + 1)
1692
    u64 carry = 1;
1693
    FOR (i, 0, 8) {
1694
        carry += (u64)x[i] + ~L[i];
1695
        carry >>= 32;
1696
    }
1697
    return carry;
1698
}
1699

1700
// Final reduction modulo L, by conditionally removing L.
1701
// if x < l     , then r = x
1702
// if l <= x 2*l, then r = x-l
1703
// otherwise the result will be wrong
1704
static void remove_l(u32 r[8], const u32 x[8])
1705
{
1706
    u64 carry = is_above_l(x);
1707
    u32 mask  = ~(u32)carry + 1; // carry == 0 or 1
1708
    FOR (i, 0, 8) {
1709
        carry += (u64)x[i] + (~L[i] & mask);
1710
        r[i]   = (u32)carry;
1711
        carry >>= 32;
1712
    }
1713
}
1714

1715
// Full reduction modulo L (Barrett reduction)
1716
static void mod_l(u8 reduced[32], const u32 x[16])
1717
{
1718
    static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1719
                             0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
1720
    // xr = x * r
1721
    u32 xr[25] = {0};
1722
    FOR (i, 0, 9) {
1723
        u64 carry = 0;
1724
        FOR (j, 0, 16) {
1725
            carry  += xr[i+j] + (u64)r[i] * x[j];
1726
            xr[i+j] = (u32)carry;
1727
            carry >>= 32;
1728
        }
1729
        xr[i+16] = (u32)carry;
1730
    }
1731
    // xr = floor(xr / 2^512) * L
1732
    // Since the result is guaranteed to be below 2*L,
1733
    // it is enough to only compute the first 256 bits.
1734
    // The division is performed by saying xr[i+16]. (16 * 32 = 512)
1735
    ZERO(xr, 8);
1736
    FOR (i, 0, 8) {
1737
        u64 carry = 0;
1738
        FOR (j, 0, 8-i) {
1739
            carry   += xr[i+j] + (u64)xr[i+16] * L[j];
1740
            xr[i+j] = (u32)carry;
1741
            carry >>= 32;
1742
        }
1743
    }
1744
    // xr = x - xr
1745
    u64 carry = 1;
1746
    FOR (i, 0, 8) {
1747
        carry  += (u64)x[i] + ~xr[i];
1748
        xr[i]   = (u32)carry;
1749
        carry >>= 32;
1750
    }
1751
    // Final reduction modulo L (conditional subtraction)
1752
    remove_l(xr, xr);
1753
    store32_le_buf(reduced, xr, 8);
1754

1755
    WIPE_BUFFER(xr);
1756
}
1757

1758
static void reduce(u8 r[64])
1759
{
1760
    u32 x[16];
1761
    load32_le_buf(x, r, 16);
1762
    mod_l(r, x);
1763
    WIPE_BUFFER(x);
1764
}
1765

1766
// r = (a * b) + c
1767
static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
1768
{
1769
    u32 A[8];  load32_le_buf(A, a, 8);
1770
    u32 B[8];  load32_le_buf(B, b, 8);
1771
    u32 p[16];
1772
    load32_le_buf(p, c, 8);
1773
    ZERO(p + 8, 8);
1774
    multiply(p, A, B);
1775
    mod_l(r, p);
1776
    WIPE_BUFFER(p);
1777
    WIPE_BUFFER(A);
1778
    WIPE_BUFFER(B);
1779
}
1780

1781
///////////////
1782
/// Ed25519 ///
1783
///////////////
1784

1785
// Point (group element, ge) in a twisted Edwards curve,
1786
// in extended projective coordinates.
1787
// ge        : x  = X/Z, y  = Y/Z, T  = XY/Z
1788
// ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1789
// ge_precomp: Z  = 1
1790
typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
1791
typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1792
typedef struct { fe Yp; fe Ym;       fe T2; } ge_precomp;
1793

1794
static void ge_zero(ge *p)
1795
{
1796
    fe_0(p->X);
1797
    fe_1(p->Y);
1798
    fe_1(p->Z);
1799
    fe_0(p->T);
1800
}
1801

1802
static void ge_tobytes(u8 s[32], const ge *h)
1803
{
1804
    fe recip, x, y;
1805
    fe_invert(recip, h->Z);
1806
    fe_mul(x, h->X, recip);
1807
    fe_mul(y, h->Y, recip);
1808
    fe_tobytes(s, y);
1809
    s[31] ^= fe_isodd(x) << 7;
1810

1811
    WIPE_BUFFER(recip);
1812
    WIPE_BUFFER(x);
1813
    WIPE_BUFFER(y);
1814
}
1815

1816
// h = s, where s is a point encoded in 32 bytes
1817
//
1818
// Variable time!  Inputs must not be secret!
1819
// => Use only to *check* signatures.
1820
//
1821
// From the specifications:
1822
//   The encoding of s contains y and the sign of x
1823
//   x = sqrt((y^2 - 1) / (d*y^2 + 1))
1824
// In extended coordinates:
1825
//   X = x, Y = y, Z = 1, T = x*y
1826
//
1827
//    Note that num * den is a square iff num / den is a square
1828
//    If num * den is not a square, the point was not on the curve.
1829
// From the above:
1830
//   Let num =   y^2 - 1
1831
//   Let den = d*y^2 + 1
1832
//   x = sqrt((y^2 - 1) / (d*y^2 + 1))
1833
//   x = sqrt(num / den)
1834
//   x = sqrt(num^2 / (num * den))
1835
//   x = num * sqrt(1 / (num * den))
1836
//
1837
// Therefore, we can just compute:
1838
//   num =   y^2 - 1
1839
//   den = d*y^2 + 1
1840
//   isr = invsqrt(num * den)  // abort if not square
1841
//   x   = num * isr
1842
// Finally, negate x if its sign is not as specified.
1843
static int ge_frombytes_vartime(ge *h, const u8 s[32])
1844
{
1845
    fe_frombytes(h->Y, s);
1846
    fe_1(h->Z);
1847
    fe_sq (h->T, h->Y);        // t =   y^2
1848
    fe_mul(h->X, h->T, d   );  // x = d*y^2
1849
    fe_sub(h->T, h->T, h->Z);  // t =   y^2 - 1
1850
    fe_add(h->X, h->X, h->Z);  // x = d*y^2 + 1
1851
    fe_mul(h->X, h->T, h->X);  // x = (y^2 - 1) * (d*y^2 + 1)
1852
    int is_square = invsqrt(h->X, h->X);
1853
    if (!is_square) {
1854
        return -1;             // Not on the curve, abort
1855
    }
1856
    fe_mul(h->X, h->T, h->X);  // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1857
    if (fe_isodd(h->X) != (s[31] >> 7)) {
1858
        fe_neg(h->X, h->X);
1859
    }
1860
    fe_mul(h->T, h->X, h->Y);
1861
    return 0;
1862
}
1863

1864
static void ge_cache(ge_cached *c, const ge *p)
1865
{
1866
    fe_add (c->Yp, p->Y, p->X);
1867
    fe_sub (c->Ym, p->Y, p->X);
1868
    fe_copy(c->Z , p->Z      );
1869
    fe_mul (c->T2, p->T, D2  );
1870
}
1871

1872
// Internal buffers are not wiped! Inputs must not be secret!
1873
// => Use only to *check* signatures.
1874
static void ge_add(ge *s, const ge *p, const ge_cached *q)
1875
{
1876
    fe a, b;
1877
    fe_add(a   , p->Y, p->X );
1878
    fe_sub(b   , p->Y, p->X );
1879
    fe_mul(a   , a   , q->Yp);
1880
    fe_mul(b   , b   , q->Ym);
1881
    fe_add(s->Y, a   , b    );
1882
    fe_sub(s->X, a   , b    );
1883

1884
    fe_add(s->Z, p->Z, p->Z );
1885
    fe_mul(s->Z, s->Z, q->Z );
1886
    fe_mul(s->T, p->T, q->T2);
1887
    fe_add(a   , s->Z, s->T );
1888
    fe_sub(b   , s->Z, s->T );
1889

1890
    fe_mul(s->T, s->X, s->Y);
1891
    fe_mul(s->X, s->X, b   );
1892
    fe_mul(s->Y, s->Y, a   );
1893
    fe_mul(s->Z, a   , b   );
1894
}
1895

1896
// Internal buffers are not wiped! Inputs must not be secret!
1897
// => Use only to *check* signatures.
1898
static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1899
{
1900
    ge_cached neg;
1901
    fe_copy(neg.Ym, q->Yp);
1902
    fe_copy(neg.Yp, q->Ym);
1903
    fe_copy(neg.Z , q->Z );
1904
    fe_neg (neg.T2, q->T2);
1905
    ge_add(s, p, &neg);
1906
}
1907

1908
static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1909
{
1910
    fe_add(a   , p->Y, p->X );
1911
    fe_sub(b   , p->Y, p->X );
1912
    fe_mul(a   , a   , q->Yp);
1913
    fe_mul(b   , b   , q->Ym);
1914
    fe_add(s->Y, a   , b    );
1915
    fe_sub(s->X, a   , b    );
1916

1917
    fe_add(s->Z, p->Z, p->Z );
1918
    fe_mul(s->T, p->T, q->T2);
1919
    fe_add(a   , s->Z, s->T );
1920
    fe_sub(b   , s->Z, s->T );
1921

1922
    fe_mul(s->T, s->X, s->Y);
1923
    fe_mul(s->X, s->X, b   );
1924
    fe_mul(s->Y, s->Y, a   );
1925
    fe_mul(s->Z, a   , b   );
1926
}
1927

1928
static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1929
{
1930
    fe_add(a   , p->Y, p->X );
1931
    fe_sub(b   , p->Y, p->X );
1932
    fe_mul(a   , a   , q->Ym);
1933
    fe_mul(b   , b   , q->Yp);
1934
    fe_add(s->Y, a   , b    );
1935
    fe_sub(s->X, a   , b    );
1936

1937
    fe_add(s->Z, p->Z, p->Z );
1938
    fe_mul(s->T, p->T, q->T2);
1939
    fe_sub(a   , s->Z, s->T );
1940
    fe_add(b   , s->Z, s->T );
1941

1942
    fe_mul(s->T, s->X, s->Y);
1943
    fe_mul(s->X, s->X, b   );
1944
    fe_mul(s->Y, s->Y, a   );
1945
    fe_mul(s->Z, a   , b   );
1946
}
1947

1948
static void ge_double(ge *s, const ge *p, ge *q)
1949
{
1950
    fe_sq (q->X, p->X);
1951
    fe_sq (q->Y, p->Y);
1952
    fe_sq2(q->Z, p->Z);
1953
    fe_add(q->T, p->X, p->Y);
1954
    fe_sq (s->T, q->T);
1955
    fe_add(q->T, q->Y, q->X);
1956
    fe_sub(q->Y, q->Y, q->X);
1957
    fe_sub(q->X, s->T, q->T);
1958
    fe_sub(q->Z, q->Z, q->Y);
1959

1960
    fe_mul(s->X, q->X , q->Z);
1961
    fe_mul(s->Y, q->T , q->Y);
1962
    fe_mul(s->Z, q->Y , q->Z);
1963
    fe_mul(s->T, q->X , q->T);
1964
}
1965

1966
// 5-bit signed window in cached format (Niels coordinates, Z=1)
1967
static const ge_precomp b_window[8] = {
1968
    {{25967493,-14356035,29566456,3660896,-12694345,
1969
      4014787,27544626,-11754271,-6079156,2047605,},
1970
     {-12545711,934262,-2722910,3049990,-727428,
1971
      9406986,12720692,5043384,19500929,-15469378,},
1972
     {-8738181,4489570,9688441,-14785194,10184609,
1973
      -12363380,29287919,11864899,-24514362,-4438546,},},
1974
    {{15636291,-9688557,24204773,-7912398,616977,
1975
      -16685262,27787600,-14772189,28944400,-1550024,},
1976
     {16568933,4717097,-11556148,-1102322,15682896,
1977
      -11807043,16354577,-11775962,7689662,11199574,},
1978
     {30464156,-5976125,-11779434,-15670865,23220365,
1979
      15915852,7512774,10017326,-17749093,-9920357,},},
1980
    {{10861363,11473154,27284546,1981175,-30064349,
1981
      12577861,32867885,14515107,-15438304,10819380,},
1982
     {4708026,6336745,20377586,9066809,-11272109,
1983
      6594696,-25653668,12483688,-12668491,5581306,},
1984
     {19563160,16186464,-29386857,4097519,10237984,
1985
      -4348115,28542350,13850243,-23678021,-15815942,},},
1986
    {{5153746,9909285,1723747,-2777874,30523605,
1987
      5516873,19480852,5230134,-23952439,-15175766,},
1988
     {-30269007,-3463509,7665486,10083793,28475525,
1989
      1649722,20654025,16520125,30598449,7715701,},
1990
     {28881845,14381568,9657904,3680757,-20181635,
1991
      7843316,-31400660,1370708,29794553,-1409300,},},
1992
    {{-22518993,-6692182,14201702,-8745502,-23510406,
1993
      8844726,18474211,-1361450,-13062696,13821877,},
1994
     {-6455177,-7839871,3374702,-4740862,-27098617,
1995
      -10571707,31655028,-7212327,18853322,-14220951,},
1996
     {4566830,-12963868,-28974889,-12240689,-7602672,
1997
      -2830569,-8514358,-10431137,2207753,-3209784,},},
1998
    {{-25154831,-4185821,29681144,7868801,-6854661,
1999
      -9423865,-12437364,-663000,-31111463,-16132436,},
2000
     {25576264,-2703214,7349804,-11814844,16472782,
2001
      9300885,3844789,15725684,171356,6466918,},
2002
     {23103977,13316479,9739013,-16149481,817875,
2003
      -15038942,8965339,-14088058,-30714912,16193877,},},
2004
    {{-33521811,3180713,-2394130,14003687,-16903474,
2005
      -16270840,17238398,4729455,-18074513,9256800,},
2006
     {-25182317,-4174131,32336398,5036987,-21236817,
2007
      11360617,22616405,9761698,-19827198,630305,},
2008
     {-13720693,2639453,-24237460,-7406481,9494427,
2009
      -5774029,-6554551,-15960994,-2449256,-14291300,},},
2010
    {{-3151181,-5046075,9282714,6866145,-31907062,
2011
      -863023,-18940575,15033784,25105118,-7894876,},
2012
     {-24326370,15950226,-31801215,-14592823,-11662737,
2013
      -5090925,1573892,-2625887,2198790,-15804619,},
2014
     {-3099351,10324967,-2241613,7453183,-5446979,
2015
      -2735503,-13812022,-16236442,-32461234,-12290683,},},
2016
};
2017

2018
// Incremental sliding windows (left to right)
2019
// Based on Roberto Maria Avanzi[2005]
2020
typedef struct {
2021
    i16 next_index; // position of the next signed digit
2022
    i8  next_digit; // next signed digit (odd number below 2^window_width)
2023
    u8  next_check; // point at which we must check for a new window
2024
} slide_ctx;
2025

2026
static void slide_init(slide_ctx *ctx, const u8 scalar[32])
2027
{
2028
    // scalar is guaranteed to be below L, either because we checked (s),
2029
    // or because we reduced it modulo L (h_ram). L is under 2^253, so
2030
    // so bits 253 to 255 are guaranteed to be zero. No need to test them.
2031
    //
2032
    // Note however that L is very close to 2^252, so bit 252 is almost
2033
    // always zero.  If we were to start at bit 251, the tests wouldn't
2034
    // catch the off-by-one error (constructing one that does would be
2035
    // prohibitively expensive).
2036
    //
2037
    // We should still check bit 252, though.
2038
    int i = 252;
2039
    while (i > 0 && scalar_bit(scalar, i) == 0) {
2040
        i--;
2041
    }
2042
    ctx->next_check = (u8)(i + 1);
2043
    ctx->next_index = -1;
2044
    ctx->next_digit = -1;
2045
}
2046

2047
static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
2048
{
2049
    if (i == ctx->next_check) {
2050
        if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
2051
            ctx->next_check--;
2052
        } else {
2053
            // compute digit of next window
2054
            int w = MIN(width, i + 1);
2055
            int v = -(scalar_bit(scalar, i) << (w-1));
2056
            FOR_T (int, j, 0, w-1) {
2057
                v += scalar_bit(scalar, i-(w-1)+j) << j;
2058
            }
2059
            v += scalar_bit(scalar, i-w);
2060
            int lsb = v & (~v + 1);            // smallest bit of v
2061
            int s   = (   ((lsb & 0xAA) != 0)  // log2(lsb)
2062
                       | (((lsb & 0xCC) != 0) << 1)
2063
                       | (((lsb & 0xF0) != 0) << 2));
2064
            ctx->next_index  = (i16)(i-(w-1)+s);
2065
            ctx->next_digit  = (i8) (v >> s   );
2066
            ctx->next_check -= (u8) w;
2067
        }
2068
    }
2069
    return i == ctx->next_index ? ctx->next_digit: 0;
2070
}
2071

2072
#define P_W_WIDTH 3 // Affects the size of the stack
2073
#define B_W_WIDTH 5 // Affects the size of the binary
2074
#define P_W_SIZE  (1<<(P_W_WIDTH-2))
2075

2076
// P = [b]B + [p]P, where B is the base point
2077
//
2078
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
2079
// => Use only to *check* signatures.
2080
static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
2081
{
2082
    // cache P window for addition
2083
    ge_cached cP[P_W_SIZE];
2084
    {
2085
        ge P2, tmp;
2086
        ge_double(&P2, P, &tmp);
2087
        ge_cache(&cP[0], P);
2088
        FOR (i, 1, P_W_SIZE) {
2089
            ge_add(&tmp, &P2, &cP[i-1]);
2090
            ge_cache(&cP[i], &tmp);
2091
        }
2092
    }
2093

2094
    // Merged double and add ladder, fused with sliding
2095
    slide_ctx p_slide;  slide_init(&p_slide, p);
2096
    slide_ctx b_slide;  slide_init(&b_slide, b);
2097
    int i = MAX(p_slide.next_check, b_slide.next_check);
2098
    ge *sum = P;
2099
    ge_zero(sum);
2100
    while (i >= 0) {
2101
        ge tmp;
2102
        ge_double(sum, sum, &tmp);
2103
        int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
2104
        int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
2105
        if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
2106
        if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
2107
        fe t1, t2;
2108
        if (b_digit > 0) { ge_madd(sum, sum, b_window +  b_digit/2, t1, t2); }
2109
        if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
2110
        i--;
2111
    }
2112
}
2113

2114
// R_check = s[B] - h_ram[pk], where B is the base point
2115
//
2116
// Variable time! Internal buffers are not wiped! Inputs must not be secret!
2117
// => Use only to *check* signatures.
2118
static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
2119
{
2120
    ge  A;      // not secret, not wiped
2121
    u32 s32[8]; // not secret, not wiped
2122
    load32_le_buf(s32, s, 8);
2123
    if (ge_frombytes_vartime(&A, pk) ||         // A = pk
2124
        is_above_l(s32)) {                      // prevent s malleability
2125
        return -1;
2126
    }
2127
    fe_neg(A.X, A.X);
2128
    fe_neg(A.T, A.T);                           // A = -pk
2129
    ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
2130
    ge_tobytes(R_check, &A);                    // R_check = A
2131
    return 0;
2132
}
2133

2134
// 5-bit signed comb in cached format (Niels coordinates, Z=1)
2135
static const ge_precomp b_comb_low[8] = {
2136
    {{-6816601,-2324159,-22559413,124364,18015490,
2137
      8373481,19993724,1979872,-18549925,9085059,},
2138
     {10306321,403248,14839893,9633706,8463310,
2139
      -8354981,-14305673,14668847,26301366,2818560,},
2140
     {-22701500,-3210264,-13831292,-2927732,-16326337,
2141
      -14016360,12940910,177905,12165515,-2397893,},},
2142
    {{-12282262,-7022066,9920413,-3064358,-32147467,
2143
      2927790,22392436,-14852487,2719975,16402117,},
2144
     {-7236961,-4729776,2685954,-6525055,-24242706,
2145
      -15940211,-6238521,14082855,10047669,12228189,},
2146
     {-30495588,-12893761,-11161261,3539405,-11502464,
2147
      16491580,-27286798,-15030530,-7272871,-15934455,},},
2148
    {{17650926,582297,-860412,-187745,-12072900,
2149
      -10683391,-20352381,15557840,-31072141,-5019061,},
2150
     {-6283632,-2259834,-4674247,-4598977,-4089240,
2151
      12435688,-31278303,1060251,6256175,10480726,},
2152
     {-13871026,2026300,-21928428,-2741605,-2406664,
2153
      -8034988,7355518,15733500,-23379862,7489131,},},
2154
    {{6883359,695140,23196907,9644202,-33430614,
2155
      11354760,-20134606,6388313,-8263585,-8491918,},
2156
     {-7716174,-13605463,-13646110,14757414,-19430591,
2157
      -14967316,10359532,-11059670,-21935259,12082603,},
2158
     {-11253345,-15943946,10046784,5414629,24840771,
2159
      8086951,-6694742,9868723,15842692,-16224787,},},
2160
    {{9639399,11810955,-24007778,-9320054,3912937,
2161
      -9856959,996125,-8727907,-8919186,-14097242,},
2162
     {7248867,14468564,25228636,-8795035,14346339,
2163
      8224790,6388427,-7181107,6468218,-8720783,},
2164
     {15513115,15439095,7342322,-10157390,18005294,
2165
      -7265713,2186239,4884640,10826567,7135781,},},
2166
    {{-14204238,5297536,-5862318,-6004934,28095835,
2167
      4236101,-14203318,1958636,-16816875,3837147,},
2168
     {-5511166,-13176782,-29588215,12339465,15325758,
2169
      -15945770,-8813185,11075932,-19608050,-3776283,},
2170
     {11728032,9603156,-4637821,-5304487,-7827751,
2171
      2724948,31236191,-16760175,-7268616,14799772,},},
2172
    {{-28842672,4840636,-12047946,-9101456,-1445464,
2173
      381905,-30977094,-16523389,1290540,12798615,},
2174
     {27246947,-10320914,14792098,-14518944,5302070,
2175
      -8746152,-3403974,-4149637,-27061213,10749585,},
2176
     {25572375,-6270368,-15353037,16037944,1146292,
2177
      32198,23487090,9585613,24714571,-1418265,},},
2178
    {{19844825,282124,-17583147,11004019,-32004269,
2179
      -2716035,6105106,-1711007,-21010044,14338445,},
2180
     {8027505,8191102,-18504907,-12335737,25173494,
2181
      -5923905,15446145,7483684,-30440441,10009108,},
2182
     {-14134701,-4174411,10246585,-14677495,33553567,
2183
      -14012935,23366126,15080531,-7969992,7663473,},},
2184
};
2185

2186
static const ge_precomp b_comb_high[8] = {
2187
    {{33055887,-4431773,-521787,6654165,951411,
2188
      -6266464,-5158124,6995613,-5397442,-6985227,},
2189
     {4014062,6967095,-11977872,3960002,8001989,
2190
      5130302,-2154812,-1899602,-31954493,-16173976,},
2191
     {16271757,-9212948,23792794,731486,-25808309,
2192
      -3546396,6964344,-4767590,10976593,10050757,},},
2193
    {{2533007,-4288439,-24467768,-12387405,-13450051,
2194
      14542280,12876301,13893535,15067764,8594792,},
2195
     {20073501,-11623621,3165391,-13119866,13188608,
2196
      -11540496,-10751437,-13482671,29588810,2197295,},
2197
     {-1084082,11831693,6031797,14062724,14748428,
2198
      -8159962,-20721760,11742548,31368706,13161200,},},
2199
    {{2050412,-6457589,15321215,5273360,25484180,
2200
      124590,-18187548,-7097255,-6691621,-14604792,},
2201
     {9938196,2162889,-6158074,-1711248,4278932,
2202
      -2598531,-22865792,-7168500,-24323168,11746309,},
2203
     {-22691768,-14268164,5965485,9383325,20443693,
2204
      5854192,28250679,-1381811,-10837134,13717818,},},
2205
    {{-8495530,16382250,9548884,-4971523,-4491811,
2206
      -3902147,6182256,-12832479,26628081,10395408,},
2207
     {27329048,-15853735,7715764,8717446,-9215518,
2208
      -14633480,28982250,-5668414,4227628,242148,},
2209
     {-13279943,-7986904,-7100016,8764468,-27276630,
2210
      3096719,29678419,-9141299,3906709,11265498,},},
2211
    {{11918285,15686328,-17757323,-11217300,-27548967,
2212
      4853165,-27168827,6807359,6871949,-1075745,},
2213
     {-29002610,13984323,-27111812,-2713442,28107359,
2214
      -13266203,6155126,15104658,3538727,-7513788,},
2215
     {14103158,11233913,-33165269,9279850,31014152,
2216
      4335090,-1827936,4590951,13960841,12787712,},},
2217
    {{1469134,-16738009,33411928,13942824,8092558,
2218
      -8778224,-11165065,1437842,22521552,-2792954,},
2219
     {31352705,-4807352,-25327300,3962447,12541566,
2220
      -9399651,-27425693,7964818,-23829869,5541287,},
2221
     {-25732021,-6864887,23848984,3039395,-9147354,
2222
      6022816,-27421653,10590137,25309915,-1584678,},},
2223
    {{-22951376,5048948,31139401,-190316,-19542447,
2224
      -626310,-17486305,-16511925,-18851313,-12985140,},
2225
     {-9684890,14681754,30487568,7717771,-10829709,
2226
      9630497,30290549,-10531496,-27798994,-13812825,},
2227
     {5827835,16097107,-24501327,12094619,7413972,
2228
      11447087,28057551,-1793987,-14056981,4359312,},},
2229
    {{26323183,2342588,-21887793,-1623758,-6062284,
2230
      2107090,-28724907,9036464,-19618351,-13055189,},
2231
     {-29697200,14829398,-4596333,14220089,-30022969,
2232
      2955645,12094100,-13693652,-5941445,7047569,},
2233
     {-3201977,14413268,-12058324,-16417589,-9035655,
2234
      -7224648,9258160,1399236,30397584,-5684634,},},
2235
};
2236

2237
static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
2238
                       const ge_precomp comb[8], const u8 scalar[32], int i)
2239
{
2240
    u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
2241
                    (scalar_bit(scalar, i + 32) << 1) +
2242
                    (scalar_bit(scalar, i + 64) << 2) +
2243
                    (scalar_bit(scalar, i + 96) << 3));
2244
    u8 high  = teeth >> 3;
2245
    u8 index = (teeth ^ (high - 1)) & 7;
2246
    FOR (j, 0, 8) {
2247
        i32 select = 1 & (((j ^ index) - 1) >> 8);
2248
        fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2249
        fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2250
        fe_ccopy(tmp_c->T2, comb[j].T2, select);
2251
    }
2252
    fe_neg(tmp_a, tmp_c->T2);
2253
    fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
2254
    fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2255
    ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2256
}
2257

2258
// p = [scalar]B, where B is the base point
2259
static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2260
{
2261
    // twin 4-bits signed combs, from Mike Hamburg's
2262
    // Fast and compact elliptic-curve cryptography (2012)
2263
    // 1 / 2 modulo L
2264
    static const u8 half_mod_L[32] = {
2265
        247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2266
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
2267
    // (2^256 - 1) / 2 modulo L
2268
    static const u8 half_ones[32] = {
2269
        142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2270
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };
2271

2272
    // All bits set form: 1 means 1, 0 means -1
2273
    u8 s_scalar[32];
2274
    mul_add(s_scalar, scalar, half_mod_L, half_ones);
2275

2276
    // Double and add ladder
2277
    fe tmp_a, tmp_b;  // temporaries for addition
2278
    ge_precomp tmp_c; // temporary for comb lookup
2279
    ge tmp_d;         // temporary for doubling
2280
    fe_1(tmp_c.Yp);
2281
    fe_1(tmp_c.Ym);
2282
    fe_0(tmp_c.T2);
2283

2284
    // Save a double on the first iteration
2285
    ge_zero(p);
2286
    lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2287
    lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2288
    // Regular double & add for the rest
2289
    for (int i = 30; i >= 0; i--) {
2290
        ge_double(p, p, &tmp_d);
2291
        lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2292
        lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2293
    }
2294
    // Note: we could save one addition at the end if we assumed the
2295
    // scalar fit in 252 bit.  Which it does in practice if it is
2296
    // selected at random.  However, non-random, non-hashed scalars
2297
    // *can* overflow 252 bits in practice.  Better account for that
2298
    // than leaving that kind of subtle corner case.
2299

2300
    WIPE_BUFFER(tmp_a);  WIPE_CTX(&tmp_d);
2301
    WIPE_BUFFER(tmp_b);  WIPE_CTX(&tmp_c);
2302
    WIPE_BUFFER(s_scalar);
2303
}
2304

2305
void crypto_sign_public_key_custom_hash(u8       public_key[32],
2306
                                        const u8 secret_key[32],
2307
                                        const crypto_sign_vtable *hash)
2308
{
2309
    u8 a[64];
2310
    hash->hash(a, secret_key, 32);
2311
    trim_scalar(a);
2312
    ge A;
2313
    ge_scalarmult_base(&A, a);
2314
    ge_tobytes(public_key, &A);
2315
    WIPE_BUFFER(a);
2316
    WIPE_CTX(&A);
2317
}
2318

2319
void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
2320
{
2321
    crypto_sign_public_key_custom_hash(public_key, secret_key,
2322
                                       &crypto_blake2b_vtable);
2323
}
2324

2325
void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
2326
                                             const u8 secret_key[32],
2327
                                             const u8 public_key[32],
2328
                                             const crypto_sign_vtable *hash)
2329
{
2330
    ctx->hash  = hash; // set vtable
2331
    u8 *a      = ctx->buf;
2332
    u8 *prefix = ctx->buf + 32;
2333
    ctx->hash->hash(a, secret_key, 32);
2334
    trim_scalar(a);
2335

2336
    if (public_key == 0) {
2337
        crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
2338
    } else {
2339
        COPY(ctx->pk, public_key, 32);
2340
    }
2341

2342
    // Deterministic part of EdDSA: Construct a nonce by hashing the message
2343
    // instead of generating a random number.
2344
    // An actual random number would work just fine, and would save us
2345
    // the trouble of hashing the message twice.  If we did that
2346
    // however, the user could fuck it up and reuse the nonce.
2347
    ctx->hash->init  (ctx);
2348
    ctx->hash->update(ctx, prefix , 32);
2349
}
2350

2351
void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
2352
                                 const u8 secret_key[32],
2353
                                 const u8 public_key[32])
2354
{
2355
    crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
2356
                                            &crypto_blake2b_vtable);
2357
}
2358

2359
void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
2360
                        const u8 *msg, size_t msg_size)
2361
{
2362
    ctx->hash->update(ctx, msg, msg_size);
2363
}
2364

2365
void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
2366
{
2367
    u8 *r        = ctx->buf + 32;
2368
    u8 *half_sig = ctx->buf + 64;
2369
    ctx->hash->final(ctx, r);
2370
    reduce(r);
2371

2372
    // first half of the signature = "random" nonce times the base point
2373
    ge R;
2374
    ge_scalarmult_base(&R, r);
2375
    ge_tobytes(half_sig, &R);
2376
    WIPE_CTX(&R);
2377

2378
    // Hash R, the public key, and the message together.
2379
    // It cannot be done in parallel with the first hash.
2380
    ctx->hash->init  (ctx);
2381
    ctx->hash->update(ctx, half_sig, 32);
2382
    ctx->hash->update(ctx, ctx->pk , 32);
2383
}
2384

2385
void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
2386
{
2387
    u8 *a        = ctx->buf;
2388
    u8 *r        = ctx->buf + 32;
2389
    u8 *half_sig = ctx->buf + 64;
2390
    u8  h_ram[64];
2391
    ctx->hash->final(ctx, h_ram);
2392
    reduce(h_ram);
2393
    COPY(signature, half_sig, 32);
2394
    mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
2395
    WIPE_BUFFER(h_ram);
2396
    crypto_wipe(ctx, ctx->hash->ctx_size);
2397
}
2398

2399
void crypto_sign(u8        signature[64],
2400
                 const u8  secret_key[32],
2401
                 const u8  public_key[32],
2402
                 const u8 *message, size_t message_size)
2403
{
2404
    crypto_sign_ctx ctx;
2405
    crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
2406
    crypto_sign_init_first_pass (actx, secret_key, public_key);
2407
    crypto_sign_update          (actx, message, message_size);
2408
    crypto_sign_init_second_pass(actx);
2409
    crypto_sign_update          (actx, message, message_size);
2410
    crypto_sign_final           (actx, signature);
2411
}
2412

2413
void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
2414
                                   const u8 signature[64],
2415
                                   const u8 public_key[32],
2416
                                   const crypto_sign_vtable *hash)
2417
{
2418
    ctx->hash = hash; // set vtable
2419
    COPY(ctx->buf, signature , 64);
2420
    COPY(ctx->pk , public_key, 32);
2421
    ctx->hash->init  (ctx);
2422
    ctx->hash->update(ctx, signature , 32);
2423
    ctx->hash->update(ctx, public_key, 32);
2424
}
2425

2426
void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
2427
                       const u8 public_key[32])
2428
{
2429
    crypto_check_init_custom_hash(ctx, signature, public_key,
2430
                                  &crypto_blake2b_vtable);
2431
}
2432

2433
void crypto_check_update(crypto_check_ctx_abstract *ctx,
2434
                         const u8 *msg, size_t msg_size)
2435
{
2436
    ctx->hash->update(ctx, msg, msg_size);
2437
}
2438

2439
int crypto_check_final(crypto_check_ctx_abstract *ctx)
2440
{
2441
    u8 h_ram[64];
2442
    ctx->hash->final(ctx, h_ram);
2443
    reduce(h_ram);
2444
    u8 *R       = ctx->buf;      // R
2445
    u8 *s       = ctx->buf + 32; // s
2446
    u8 *R_check = ctx->pk;       // overwrite ctx->pk to save stack space
2447
    if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
2448
        return -1;
2449
    }
2450
    return crypto_verify32(R, R_check); // R == R_check ? OK : fail
2451
}
2452

2453
int crypto_check(const u8  signature[64], const u8 public_key[32],
2454
                 const u8 *message, size_t message_size)
2455
{
2456
    crypto_check_ctx ctx;
2457
    crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
2458
    crypto_check_init  (actx, signature, public_key);
2459
    crypto_check_update(actx, message, message_size);
2460
    return crypto_check_final(actx);
2461
}
2462

2463
///////////////////////
2464
/// EdDSA to X25519 ///
2465
///////////////////////
2466
void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
2467
{
2468
    u8 a[64];
2469
    crypto_blake2b(a, eddsa, 32);
2470
    COPY(x25519, a, 32);
2471
    WIPE_BUFFER(a);
2472
}
2473

2474
void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
2475
{
2476
    fe t1, t2;
2477
    fe_frombytes(t2, eddsa);
2478
    fe_add(t1, fe_one, t2);
2479
    fe_sub(t2, fe_one, t2);
2480
    fe_invert(t2, t2);
2481
    fe_mul(t1, t1, t2);
2482
    fe_tobytes(x25519, t1);
2483
    WIPE_BUFFER(t1);
2484
    WIPE_BUFFER(t2);
2485
}
2486

2487
/////////////////////////////////////////////
2488
/// Dirty ephemeral public key generation ///
2489
/////////////////////////////////////////////
2490

2491
// Those functions generates a public key, *without* clearing the
2492
// cofactor.  Sending that key over the network leaks 3 bits of the
2493
// private key.  Use only to generate ephemeral keys that will be hidden
2494
// with crypto_curve_to_hidden().
2495
//
2496
// The public key is otherwise compatible with crypto_x25519() and
2497
// crypto_key_exchange() (those properly clear the cofactor).
2498
//
2499
// Note that the distribution of the resulting public keys is almost
2500
// uniform.  Flipping the sign of the v coordinate (not provided by this
2501
// function), covers the entire key space almost perfectly, where
2502
// "almost" means a 2^-128 bias (undetectable).  This uniformity is
2503
// needed to ensure the proper randomness of the resulting
2504
// representatives (once we apply crypto_curve_to_hidden()).
2505
//
2506
// Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2507
// to be confused with the prime order of the main subgroup, L, which is
2508
// 8 times less than that).
2509
//
2510
// Generating all points would require us to multiply a point of order C
2511
// (the base point plus any point of order 8) by all scalars from 0 to
2512
// C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2513
// by negating the resulting point at random, we also cover scalars from
2514
// -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2515
//
2516
// In practice:
2517
// - Scalars from 0         to e + 1     are never generated
2518
// - Scalars from 2^255     to 2^255 + e are never generated
2519
// - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2520
//
2521
// Since e < 2^128, detecting this bias requires observing over 2^100
2522
// representatives from a given source (this will never happen), *and*
2523
// recovering enough of the private key to determine that they do, or do
2524
// not, belong to the biased set (this practically requires solving
2525
// discrete logarithm, which is conjecturally intractable).
2526
//
2527
// In practice, this means the bias is impossible to detect.
2528

2529
// s + (x*L) % 8*L
2530
// Guaranteed to fit in 256 bits iff s fits in 255 bits.
2531
//   L             < 2^253
2532
//   x%8           < 2^3
2533
//   L * (x%8)     < 2^255
2534
//   s             < 2^255
2535
//   s + L * (x%8) < 2^256
2536
static void add_xl(u8 s[32], u8 x)
2537
{
2538
    u64 mod8  = x & 7;
2539
    u64 carry = 0;
2540
    FOR (i , 0, 8) {
2541
        carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2542
        store32_le(s + 4*i, (u32)carry);
2543
        carry >>= 32;
2544
    }
2545
}
2546

2547
// "Small" dirty ephemeral key.
2548
// Use if you need to shrink the size of the binary, and can afford to
2549
// slow down by a factor of two (compared to the fast version)
2550
//
2551
// This version works by decoupling the cofactor from the main factor.
2552
//
2553
// - The trimmed scalar determines the main factor
2554
// - The clamped bits of the scalar determine the cofactor.
2555
//
2556
// Cofactor and main factor are combined into a single scalar, which is
2557
// then multiplied by a point of order 8*L (unlike the base point, which
2558
// has prime order).  That "dirty" base point is the addition of the
2559
// regular base point (9), and a point of order 8.
2560
void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2561
{
2562
    // Base point of order 8*L
2563
    // Raw scalar multiplication with it does not clear the cofactor,
2564
    // and the resulting public key will reveal 3 bits of the scalar.
2565
    static const u8 dirty_base_point[32] = {
2566
        0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
2567
        0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
2568
        0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
2569
    // separate the main factor & the cofactor of the scalar
2570
    u8 scalar[32];
2571
    COPY(scalar, secret_key, 32);
2572
    trim_scalar(scalar);
2573

2574
    // Separate the main factor and the cofactor
2575
    //
2576
    // The scalar is trimmed, so its cofactor is cleared.  The three
2577
    // least significant bits however still have a main factor.  We must
2578
    // remove it for X25519 compatibility.
2579
    //
2580
    // We exploit the fact that 5*L = 1 (modulo 8)
2581
    //   cofactor = lsb * 5 * L             (modulo 8*L)
2582
    //   combined = scalar + cofactor       (modulo 8*L)
2583
    //   combined = scalar + (lsb * 5 * L)  (modulo 8*L)
2584
    add_xl(scalar, secret_key[0] * 5);
2585
    scalarmult(public_key, scalar, dirty_base_point, 256);
2586
    WIPE_BUFFER(scalar);
2587
}
2588

2589
// "Fast" dirty ephemeral key
2590
// We use this one by default.
2591
//
2592
// This version works by performing a regular scalar multiplication,
2593
// then add a low order point.  The scalar multiplication is done in
2594
// Edwards space for more speed (*2 compared to the "small" version).
2595
// The cost is a bigger binary for programs that don't also sign messages.
2596
void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
2597
{
2598
    u8 scalar[32];
2599
    ge pk;
2600
    COPY(scalar, secret_key, 32);
2601
    trim_scalar(scalar);
2602
    ge_scalarmult_base(&pk, scalar);
2603

2604
    // Select low order point
2605
    // We're computing the [cofactor]lop scalar multiplication, where:
2606
    //   cofactor = tweak & 7.
2607
    //   lop      = (lop_x, lop_y)
2608
    //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
2609
    //   lop_y    = -lop_x * sqrtm1
2610
    // Notes:
2611
    // - A (single) Montgomery ladder would be twice as slow.
2612
    // - An actual scalar multiplication would hurt performance.
2613
    // - A full table lookup would take more code.
2614
    u8 cofactor = secret_key[0] & 7;
2615
    int a = (cofactor >> 2) & 1;
2616
    int b = (cofactor >> 1) & 1;
2617
    int c = (cofactor >> 0) & 1;
2618
    fe t1, t2, t3;
2619
    fe_0(t1);
2620
    fe_ccopy(t1, sqrtm1, b);
2621
    fe_ccopy(t1, lop_x , c);
2622
    fe_neg  (t3, t1);
2623
    fe_ccopy(t1, t3, a);
2624
    fe_1(t2);
2625
    fe_0(t3);
2626
    fe_ccopy(t2, t3   , b);
2627
    fe_ccopy(t2, lop_y, c);
2628
    fe_neg  (t3, t2);
2629
    fe_ccopy(t2, t3, a^b);
2630
    ge_precomp low_order_point;
2631
    fe_add(low_order_point.Yp, t2, t1);
2632
    fe_sub(low_order_point.Ym, t2, t1);
2633
    fe_mul(low_order_point.T2, t2, t1);
2634
    fe_mul(low_order_point.T2, low_order_point.T2, D2);
2635

2636
    // Add low order point to the public key
2637
    ge_madd(&pk, &pk, &low_order_point, t1, t2);
2638

2639
    // Convert to Montgomery u coordinate (we ignore the sign)
2640
    fe_add(t1, pk.Z, pk.Y);
2641
    fe_sub(t2, pk.Z, pk.Y);
2642
    fe_invert(t2, t2);
2643
    fe_mul(t1, t1, t2);
2644

2645
    fe_tobytes(public_key, t1);
2646

2647
    WIPE_BUFFER(t1);  WIPE_BUFFER(scalar);
2648
    WIPE_BUFFER(t2);  WIPE_CTX(&pk);
2649
    WIPE_BUFFER(t3);  WIPE_CTX(&low_order_point);
2650
}
2651

2652
///////////////////
2653
/// Elligator 2 ///
2654
///////////////////
2655
static const fe A = {486662};
2656

2657
// Elligator direct map
2658
//
2659
// Computes the point corresponding to a representative, encoded in 32
2660
// bytes (little Endian).  Since positive representatives fits in 254
2661
// bits, The two most significant bits are ignored.
2662
//
2663
// From the paper:
2664
// w = -A / (fe(1) + non_square * r^2)
2665
// e = chi(w^3 + A*w^2 + w)
2666
// u = e*w - (fe(1)-e)*(A//2)
2667
// v = -e * sqrt(u^3 + A*u^2 + u)
2668
//
2669
// We ignore v because we don't need it for X25519 (the Montgomery
2670
// ladder only uses u).
2671
//
2672
// Note that e is either 0, 1 or -1
2673
// if e = 0    u = 0  and v = 0
2674
// if e = 1    u = w
2675
// if e = -1   u = -w - A = w * non_square * r^2
2676
//
2677
// Let r1 = non_square * r^2
2678
// Let r2 = 1 + r1
2679
// Note that r2 cannot be zero, -1/non_square is not a square.
2680
// We can (tediously) verify that:
2681
//   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
2682
// Therefore:
2683
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
2684
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
2685
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
2686
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)  *     r2^6)
2687
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) *  A * r2^3)
2688
// Corollary:
2689
//   e =  1 if (A^2*r1 - r2^2) *  A * r2^3) is a non-zero square
2690
//   e = -1 if (A^2*r1 - r2^2) *  A * r2^3) is not a square
2691
//   Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
2692
//     w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
2693
//     w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
2694
//     w^3 + A*w^2 + w = w * (w + A/2)^2        - A^2/4 + 1)
2695
//     which is zero only if:
2696
//       w = 0                   (impossible)
2697
//       (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
2698
//
2699
// Let isr   = invsqrt((A^2*r1 - r2^2) *  A * r2^3)
2700
//     isr   = sqrt(1        / ((A^2*r1 - r2^2) *  A * r2^3)) if e =  1
2701
//     isr   = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) *  A * r2^3)) if e = -1
2702
//
2703
// if e = 1
2704
//   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
2705
//       u1 = w
2706
//       u1 = u
2707
//
2708
// if e = -1
2709
//   let ufactor = -non_square * sqrt(-1) * r^2
2710
//   let vfactor = sqrt(ufactor)
2711
//   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
2712
//       u2 = w * -1 * -non_square * r^2
2713
//       u2 = w * non_square * r^2
2714
//       u2 = u
2715
void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
2716
{
2717
    // Representatives are encoded in 254 bits.
2718
    // The two most significant ones are random padding that must be ignored.
2719
    u8 clamped[32];
2720
    COPY(clamped, hidden, 32);
2721
    clamped[31] &= 0x3f;
2722

2723
    fe r, u, t1, t2, t3;
2724
    fe_frombytes(r, clamped);
2725
    fe_sq2(t1, r);
2726
    fe_add(u, t1, fe_one);
2727
    fe_sq (t2, u);
2728
    fe_mul(t3, A2, t1);
2729
    fe_sub(t3, t3, t2);
2730
    fe_mul(t3, t3, A);
2731
    fe_mul(t1, t2, u);
2732
    fe_mul(t1, t3, t1);
2733
    int is_square = invsqrt(t1, t1);
2734
    fe_sq(u, r);
2735
    fe_mul(u, u, ufactor);
2736
    fe_ccopy(u, fe_one, is_square);
2737
    fe_sq (t1, t1);
2738
    fe_mul(u, u, A);
2739
    fe_mul(u, u, t3);
2740
    fe_mul(u, u, t2);
2741
    fe_mul(u, u, t1);
2742
    fe_neg(u, u);
2743
    fe_tobytes(curve, u);
2744

2745
    WIPE_BUFFER(t1);  WIPE_BUFFER(r);
2746
    WIPE_BUFFER(t2);  WIPE_BUFFER(u);
2747
    WIPE_BUFFER(t3);  WIPE_BUFFER(clamped);
2748
}
2749

2750
// Elligator inverse map
2751
//
2752
// Computes the representative of a point, if possible.  If not, it does
2753
// nothing and returns -1.  Note that the success of the operation
2754
// depends only on the point (more precisely its u coordinate).  The
2755
// tweak parameter is used only upon success
2756
//
2757
// The tweak should be a random byte.  Beyond that, its contents are an
2758
// implementation detail. Currently, the tweak comprises:
2759
// - Bit  1  : sign of the v coordinate (0 if positive, 1 if negative)
2760
// - Bit  2-5: not used
2761
// - Bits 6-7: random padding
2762
//
2763
// From the paper:
2764
// Let sq = -non_square * u * (u+A)
2765
// if sq is not a square, or u = -A, there is no mapping
2766
// Assuming there is a mapping:
2767
//   if v is positive: r = sqrt(-(u+A) / u)
2768
//   if v is negative: r = sqrt(-u / (u+A))
2769
//
2770
// We compute isr = invsqrt(-non_square * u * (u+A))
2771
// if it wasn't a non-zero square, abort.
2772
// else, isr = sqrt(-1 / (non_square * u * (u+A))
2773
//
2774
// This causes us to abort if u is zero, even though we shouldn't. This
2775
// never happens in practice, because (i) a random point in the curve has
2776
// a negligible chance of being zero, and (ii) scalar multiplication with
2777
// a trimmed scalar *never* yields zero.
2778
//
2779
// Since:
2780
//   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A)) * (u+A)
2781
//   isr * (u+A) = sqrt(-(u+A) / (non_square * u * (u+A))
2782
// and:
2783
//   isr = u = sqrt(-1 / (non_square * u * (u+A)) * u
2784
//   isr = u = sqrt(-u / (non_square * u * (u+A))
2785
// Therefore:
2786
//   if v is positive: r = isr * (u+A)
2787
//   if v is negative: r = isr * u
2788
int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
2789
{
2790
    fe t1, t2, t3;
2791
    fe_frombytes(t1, public_key);
2792

2793
    fe_add(t2, t1, A);
2794
    fe_mul(t3, t1, t2);
2795
    fe_mul_small(t3, t3, -2);
2796
    int is_square = invsqrt(t3, t3);
2797
    if (!is_square) {
2798
        // The only variable time bit.  This ultimately reveals how many
2799
        // tries it took us to find a representable key.
2800
        // This does not affect security as long as we try keys at random.
2801
        WIPE_BUFFER(t1);
2802
        WIPE_BUFFER(t2);
2803
        WIPE_BUFFER(t3);
2804
        return -1;
2805
    }
2806
    fe_ccopy    (t1, t2, tweak & 1);
2807
    fe_mul      (t3, t1, t3);
2808
    fe_mul_small(t1, t3, 2);
2809
    fe_neg      (t2, t3);
2810
    fe_ccopy    (t3, t2, fe_isodd(t1));
2811
    fe_tobytes(hidden, t3);
2812

2813
    // Pad with two random bits
2814
    hidden[31] |= tweak & 0xc0;
2815

2816
    WIPE_BUFFER(t1);
2817
    WIPE_BUFFER(t2);
2818
    WIPE_BUFFER(t3);
2819
    return 0;
2820
}
2821

2822
void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
2823
{
2824
    u8 pk [32]; // public key
2825
    u8 buf[64]; // seed + representative
2826
    COPY(buf + 32, seed, 32);
2827
    do {
2828
        crypto_chacha20(buf, 0, 64, buf+32, zero);
2829
        crypto_x25519_dirty_fast(pk, buf); // or the "small" version
2830
    } while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
2831
    // Note that the return value of crypto_curve_to_hidden() is
2832
    // independent from its tweak parameter.
2833
    // Therefore, buf[32] is not actually reused.  Either we loop one
2834
    // more time and buf[32] is used for the new seed, or we succeeded,
2835
    // and buf[32] becomes the tweak parameter.
2836

2837
    crypto_wipe(seed, 32);
2838
    COPY(hidden    , buf + 32, 32);
2839
    COPY(secret_key, buf     , 32);
2840
    WIPE_BUFFER(buf);
2841
    WIPE_BUFFER(pk);
2842
}
2843

2844
////////////////////
2845
/// Key exchange ///
2846
////////////////////
2847
void crypto_key_exchange(u8       shared_key[32],
2848
                         const u8 your_secret_key [32],
2849
                         const u8 their_public_key[32])
2850
{
2851
    crypto_x25519(shared_key, your_secret_key, their_public_key);
2852
    crypto_hchacha20(shared_key, shared_key, zero);
2853
}
2854

2855
///////////////////////
2856
/// Scalar division ///
2857
///////////////////////
2858

2859
// Montgomery reduction.
2860
// Divides x by (2^256), and reduces the result modulo L
2861
//
2862
// Precondition:
2863
//   x < L * 2^256
2864
// Constants:
2865
//   r = 2^256                 (makes division by r trivial)
2866
//   k = (r * (1/r) - 1) // L  (1/r is computed modulo L   )
2867
// Algorithm:
2868
//   s = (x * k) % r
2869
//   t = x + s*L      (t is always a multiple of r)
2870
//   u = (t/r) % L    (u is always below 2*L, conditional subtraction is enough)
2871
static void redc(u32 u[8], u32 x[16])
2872
{
2873
    static const u32 k[8]  = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
2874
                               0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
2875
    static const u32 l[8]  = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
2876
                               0x00000000, 0x00000000, 0x00000000, 0x10000000,};
2877
    // s = x * k (modulo 2^256)
2878
    // This is cheaper than the full multiplication.
2879
    u32 s[8] = {0};
2880
    FOR (i, 0, 8) {
2881
        u64 carry = 0;
2882
        FOR (j, 0, 8-i) {
2883
            carry  += s[i+j] + (u64)x[i] * k[j];
2884
            s[i+j]  = (u32)carry;
2885
            carry >>= 32;
2886
        }
2887
    }
2888
    u32 t[16] = {0};
2889
    multiply(t, s, l);
2890

2891
    // t = t + x
2892
    u64 carry = 0;
2893
    FOR (i, 0, 16) {
2894
        carry  += (u64)t[i] + x[i];
2895
        t[i]    = (u32)carry;
2896
        carry >>= 32;
2897
    }
2898

2899
    // u = (t / 2^256) % L
2900
    // Note that t / 2^256 is always below 2*L,
2901
    // So a constant time conditional subtraction is enough
2902
    // We work with L directly, in a 2's complement encoding
2903
    // (-L == ~L + 1)
2904
    remove_l(u, t+8);
2905

2906
    WIPE_BUFFER(s);
2907
    WIPE_BUFFER(t);
2908
}
2909

2910
void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
2911
                           const u8 curve_point[32])
2912
{
2913
    static const  u8 Lm2[32] = { // L - 2
2914
        0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
2915
        0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2916
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
2917
    // 1 in Montgomery form
2918
    u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
2919
                     0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};
2920

2921
    u8 scalar[32];
2922
    COPY(scalar, private_key, 32);
2923
    trim_scalar(scalar);
2924

2925
    // Convert the scalar in Montgomery form
2926
    // m_scl = scalar * 2^256 (modulo L)
2927
    u32 m_scl[8];
2928
    {
2929
        u32 tmp[16];
2930
        ZERO(tmp, 8);
2931
        load32_le_buf(tmp+8, scalar, 8);
2932
        mod_l(scalar, tmp);
2933
        load32_le_buf(m_scl, scalar, 8);
2934
        WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
2935
    }
2936

2937
    u32 product[16];
2938
    for (int i = 252; i >= 0; i--) {
2939
        ZERO(product, 16);
2940
        multiply(product, m_inv, m_inv);
2941
        redc(m_inv, product);
2942
        if (scalar_bit(Lm2, i)) {
2943
            ZERO(product, 16);
2944
            multiply(product, m_inv, m_scl);
2945
            redc(m_inv, product);
2946
        }
2947
    }
2948
    // Convert the inverse *out* of Montgomery form
2949
    // scalar = m_inv / 2^256 (modulo L)
2950
    COPY(product, m_inv, 8);
2951
    ZERO(product + 8, 8);
2952
    redc(m_inv, product);
2953
    store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
2954

2955
    // Clear the cofactor of scalar:
2956
    //   cleared = scalar * (3*L + 1)      (modulo 8*L)
2957
    //   cleared = scalar + scalar * 3 * L (modulo 8*L)
2958
    // Note that (scalar * 3) is reduced modulo 8, so we only need the
2959
    // first byte.
2960
    add_xl(scalar, scalar[0] * 3);
2961

2962
    // Recall that 8*L < 2^256. However it is also very close to
2963
    // 2^255. If we spanned the ladder over 255 bits, random tests
2964
    // wouldn't catch the off-by-one error.
2965
    scalarmult(blind_salt, scalar, curve_point, 256);
2966

2967
    WIPE_BUFFER(scalar);   WIPE_BUFFER(m_scl);
2968
    WIPE_BUFFER(product);  WIPE_BUFFER(m_inv);
2969
}
2970

2971
////////////////////////////////
2972
/// Authenticated encryption ///
2973
////////////////////////////////
2974
static void lock_auth(u8 mac[16], const u8  auth_key[32],
2975
                      const u8 *ad         , size_t ad_size,
2976
                      const u8 *cipher_text, size_t text_size)
2977
{
2978
    u8 sizes[16]; // Not secret, not wiped
2979
    store64_le(sizes + 0, ad_size);
2980
    store64_le(sizes + 8, text_size);
2981
    crypto_poly1305_ctx poly_ctx;           // auto wiped...
2982
    crypto_poly1305_init  (&poly_ctx, auth_key);
2983
    crypto_poly1305_update(&poly_ctx, ad         , ad_size);
2984
    crypto_poly1305_update(&poly_ctx, zero       , align(ad_size, 16));
2985
    crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
2986
    crypto_poly1305_update(&poly_ctx, zero       , align(text_size, 16));
2987
    crypto_poly1305_update(&poly_ctx, sizes      , 16);
2988
    crypto_poly1305_final (&poly_ctx, mac); // ...here
2989
}
2990

2991
void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
2992
                      const u8  key[32], const u8  nonce[24],
2993
                      const u8 *ad        , size_t ad_size,
2994
                      const u8 *plain_text, size_t text_size)
2995
{
2996
    u8 sub_key[32];
2997
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
2998
    crypto_hchacha20(sub_key, key, nonce);
2999
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
3000
    crypto_chacha20_ctr(cipher_text, plain_text, text_size,
3001
                        sub_key, nonce + 16, 1);
3002
    lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
3003
    WIPE_BUFFER(sub_key);
3004
    WIPE_BUFFER(auth_key);
3005
}
3006

3007
int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
3008
                       const u8  mac[16],
3009
                       const u8 *ad         , size_t ad_size,
3010
                       const u8 *cipher_text, size_t text_size)
3011
{
3012
    u8 sub_key[32];
3013
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
3014
    crypto_hchacha20(sub_key, key, nonce);
3015
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
3016
    u8 real_mac[16];
3017
    lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
3018
    WIPE_BUFFER(auth_key);
3019
    if (crypto_verify16(mac, real_mac)) {
3020
        WIPE_BUFFER(sub_key);
3021
        WIPE_BUFFER(real_mac);
3022
        return -1;
3023
    }
3024
    crypto_chacha20_ctr(plain_text, cipher_text, text_size,
3025
                        sub_key, nonce + 16, 1);
3026
    WIPE_BUFFER(sub_key);
3027
    WIPE_BUFFER(real_mac);
3028
    return 0;
3029
}
3030

3031
void crypto_lock(u8 mac[16], u8 *cipher_text,
3032
                 const u8 key[32], const u8 nonce[24],
3033
                 const u8 *plain_text, size_t text_size)
3034
{
3035
    crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
3036
}
3037

3038
int crypto_unlock(u8 *plain_text,
3039
                  const u8 key[32], const u8 nonce[24], const u8 mac[16],
3040
                  const u8 *cipher_text, size_t text_size)
3041
{
3042
    return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
3043
                              cipher_text, text_size);
3044
}
3045

3046
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.