/*
 * Origin: GitHub repository tpruvot/cpuminer-multi
 * Path: blob/linux/yescrypt/yescrypt-opt.c
 */

/*-
 * Copyright 2009 Colin Percival
 * Copyright 2013,2014 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include "sha256_Y.h"
#include "sysendian.h"

#include "yescrypt-platform.h"

/**
 * blkcpy(dest, src, count):
 * Copy count 64-bit words from src to dest, front to back.
 *
 * The callers visible in this file pass a multiple of 4, which is handled by
 * the unrolled main loop.  Unlike a bare do/while that subtracts 4 each pass,
 * this version is also safe for count == 0 and for counts that are not a
 * multiple of 4 (the remainder is copied word by word).
 */
static __inline void blkcpy(uint64_t * dest, const uint64_t * src, size_t count)
{
	/* Main loop: four words per iteration */
	while (count >= 4) {
		dest[0] = src[0];
		dest[1] = src[1];
		dest[2] = src[2];
		dest[3] = src[3];
		dest += 4;
		src += 4;
		count -= 4;
	}

	/* Tail: 0 to 3 leftover words */
	while (count > 0) {
		*dest++ = *src++;
		count--;
	}
}

/**
 * blkxor(dest, src, count):
 * XOR count 64-bit words from src into dest (dest[i] ^= src[i]).
 *
 * The callers visible in this file pass a multiple of 4, which is handled by
 * the unrolled main loop.  Unlike a bare do/while that subtracts 4 each pass,
 * this version is also safe for count == 0 and for counts that are not a
 * multiple of 4 (the remainder is processed word by word).
 */
static __inline void blkxor(uint64_t * dest, const uint64_t * src, size_t count)
{
	/* Main loop: four words per iteration */
	while (count >= 4) {
		dest[0] ^= src[0];
		dest[1] ^= src[1];
		dest[2] ^= src[2];
		dest[3] ^= src[3];
		dest += 4;
		src += 4;
		count -= 4;
	}

	/* Tail: 0 to 3 leftover words */
	while (count > 0) {
		*dest++ ^= *src++;
		count--;
	}
}

/*
 * One 64-byte Salsa20 block, viewable either as sixteen 32-bit words (w) or
 * as eight 64-bit words (d).  Both members overlay the same storage.
 */
typedef union {
	uint32_t w[16];
	uint64_t d[8];
} salsa20_blk_t;

/**
 * salsa20_simd_shuffle(Bin, Bout):
 * Rearrange the sixteen 32-bit words of Bin into the eight 64-bit words of
 * Bout.  Each output word d[out] takes its low half from Bin->w[in1 * 2] and
 * its high half from Bin->w[in2 * 2 + 1].  salsa20_simd_unshuffle() applies
 * the opposite mapping.  Bin and Bout must be distinct objects, since Bout
 * is written while Bin is still being read.
 */
static __inline void salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
	Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
	COMBINE(0, 0, 2)
	COMBINE(1, 5, 7)
	COMBINE(2, 2, 4)
	COMBINE(3, 7, 1)
	COMBINE(4, 4, 6)
	COMBINE(5, 1, 3)
	COMBINE(6, 6, 0)
	COMBINE(7, 3, 5)
#undef COMBINE
}

/**
 * salsa20_simd_unshuffle(Bin, Bout):
 * Rearrange the eight 64-bit words of Bin into the sixteen 32-bit words of
 * Bout.  For each pair, Bout->w[out * 2] receives the low 32 bits of
 * Bin->d[in1] and Bout->w[out * 2 + 1] receives the high 32 bits of
 * Bin->d[in2].  Bin and Bout must be distinct objects, since Bout is written
 * while Bin is still being read.
 */
static __inline void salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout)
{
#define COMBINE(out, in1, in2) \
	Bout->w[out * 2] = (uint32_t) Bin->d[in1]; \
	Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
	COMBINE(0, 0, 6)
	COMBINE(1, 5, 3)
	COMBINE(2, 2, 0)
	COMBINE(3, 7, 5)
	COMBINE(4, 4, 2)
	COMBINE(5, 1, 7)
	COMBINE(6, 6, 4)
	COMBINE(7, 3, 1)
#undef COMBINE
}

/**
93
* salsa20_8(B):
94
* Apply the salsa20/8 core to the provided block.
95
*/
96
static void salsa20_8(uint64_t B[8])
97
{
98
size_t i;
99
salsa20_blk_t X;
100
#define x X.w
101
102
salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X);
103
104
for (i = 0; i < 8; i += 2) {
105
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
106
/* Operate on columns */
107
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
108
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
109
110
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
111
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
112
113
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
114
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
115
116
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
117
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
118
119
/* Operate on rows */
120
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
121
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
122
123
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
124
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
125
126
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
127
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
128
129
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
130
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
131
#undef R
132
}
133
#undef x
134
135
{
136
salsa20_blk_t Y;
137
salsa20_simd_shuffle(&X, &Y);
138
for (i = 0; i < 16; i += 4) {
139
((salsa20_blk_t *)B)->w[i] += Y.w[i];
140
((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1];
141
((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2];
142
((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3];
143
}
144
}
145
}
146
147
/**
 * blockmix_salsa8(Bin, Bout, X, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
 * bytes in length; the output Bout must also be the same size.  The
 * temporary space X must be 64 bytes.
 *
 * Two 64-byte sub-blocks are processed per loop iteration: the result for
 * the even-indexed one goes to the first half of Bout and the result for the
 * odd-indexed one goes to the second half.
 */
static void
blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r)
{
	size_t i;

	/* 1: X <-- B_{2r - 1} */
	blkcpy(X, &Bin[(2 * r - 1) * 8], 8);

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < 2 * r; i += 2) {
		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8], 8);
		salsa20_8(X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		blkcpy(&Bout[i * 4], X, 8);

		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8 + 8], 8);
		salsa20_8(X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		blkcpy(&Bout[i * 4 + r * 8], X, 8);
	}
}

/* These are tunable */
#define S_BITS 8
#define S_SIMD 2
#define S_P 4
#define S_ROUNDS 6

/* Number of S-boxes.  Not tunable, hard-coded in a few places. */
#define S_N 2

/* Derived values.  Not tunable on their own. */
/* Entries per S-box */
#define S_SIZE1 (1 << S_BITS)
/* Byte-offset mask selecting one S-box entry of S_SIMD 64-bit words */
#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8)
/* S_MASK replicated into both 32-bit halves of a 64-bit word */
#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK)
/* Total size of all S-boxes, in 64-bit words */
#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD)
/* Size of one pwxform block, in 64-bit words */
#define S_P_SIZE (S_P * S_SIMD)
/* Smallest r accepted together with YESCRYPT_PWXFORM */
#define S_MIN_R ((S_P * S_SIMD + 15) / 16)

/**
199
* pwxform(B):
200
* Transform the provided block using the provided S-boxes.
201
*/
202
static void block_pwxform(uint64_t * B, const uint64_t * S)
203
{
204
uint64_t (*X)[S_SIMD] = (uint64_t (*)[S_SIMD])B;
205
const uint8_t *S0 = (const uint8_t *)S;
206
const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD);
207
size_t i, j;
208
#if S_SIMD > 2
209
size_t k;
210
#endif
211
212
for (j = 0; j < S_P; j++) {
213
uint64_t *Xj = X[j];
214
uint64_t x0 = Xj[0];
215
#if S_SIMD > 1
216
uint64_t x1 = Xj[1];
217
#endif
218
219
for (i = 0; i < S_ROUNDS; i++) {
220
uint64_t x = x0 & S_MASK2;
221
const uint64_t *p0, *p1;
222
223
p0 = (const uint64_t *)(S0 + (uint32_t)x);
224
p1 = (const uint64_t *)(S1 + (x >> 32));
225
226
x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0;
227
x0 += p0[0];
228
x0 ^= p1[0];
229
230
#if S_SIMD > 1
231
x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1;
232
x1 += p0[1];
233
x1 ^= p1[1];
234
#endif
235
236
#if S_SIMD > 2
237
for (k = 2; k < S_SIMD; k++) {
238
x = Xj[k];
239
240
x = (uint64_t)(x >> 32) * (uint32_t)x;
241
x += p0[k];
242
x ^= p1[k];
243
244
Xj[k] = x;
245
}
246
#endif
247
}
248
249
Xj[0] = x0;
250
#if S_SIMD > 1
251
Xj[1] = x1;
252
#endif
253
}
254
}
255
256
/**
257
* blockmix_pwxform(Bin, Bout, S, r):
258
* Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must
259
* be 128r bytes in length; the output Bout must also be the same size.
260
*
261
* S lacks const qualifier to match blockmix_salsa8()'s prototype, which we
262
* need to refer to both functions via the same function pointers.
263
*/
264
static void blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r)
265
{
266
size_t r1, r2, i;
267
268
/* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */
269
r1 = r * 128 / (S_P_SIZE * 8);
270
271
/* X <-- B_{r1 - 1} */
272
blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE);
273
274
/* X <-- X \xor B_i */
275
blkxor(Bout, Bin, S_P_SIZE);
276
277
/* X <-- H'(X) */
278
/* B'_i <-- X */
279
block_pwxform(Bout, S);
280
281
/* for i = 0 to r1 - 1 do */
282
for (i = 1; i < r1; i++) {
283
/* X <-- X \xor B_i */
284
blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],
285
S_P_SIZE);
286
blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE);
287
288
/* X <-- H'(X) */
289
/* B'_i <-- X */
290
block_pwxform(&Bout[i * S_P_SIZE], S);
291
}
292
293
/* Handle partial blocks */
294
if (i * S_P_SIZE < r * 16)
295
blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],
296
r * 16 - i * S_P_SIZE);
297
298
i = (r1 - 1) * S_P_SIZE / 8;
299
/* Convert 128-byte blocks to 64-byte blocks */
300
r2 = r * 2;
301
302
/* B'_i <-- H(B'_i) */
303
salsa20_8(&Bout[i * 8]);
304
i++;
305
306
for (; i < r2; i++) {
307
/* B'_i <-- H(B'_i \xor B'_{i-1}) */
308
blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8);
309
salsa20_8(&Bout[i * 8]);
310
}
311
}
312
313
/**
 * integerify(B, r):
 * Return the result of parsing B_{2r-1} as a little-endian integer.
 *
 * B must hold at least 2r 64-byte blocks (16r 64-bit words) in the shuffled
 * layout; only the last block is read.
 */
static __inline uint64_t
integerify(const uint64_t * B, size_t r)
{
/*
 * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit
 * word of B_{2r-1} due to SIMD shuffling.  The 64-bit value we return is also
 * in host byte order, as it should be.
 */
	const uint64_t * X = &B[(2 * r - 1) * 8];
	uint32_t lo = (uint32_t) X[0];
	uint32_t hi = (uint32_t) (X[6] >> 32);
	return ((uint64_t)hi << 32) + lo;
}

/**
332
* smix1(B, r, N, flags, V, NROM, shared, XY, S):
333
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
334
* length; the temporary storage V must be 128rN bytes in length; the temporary
335
* storage XY must be 256r + 64 bytes in length. The value N must be even and
336
* no smaller than 2.
337
*/
338
static void
339
smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags,
340
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
341
uint64_t * XY, uint64_t * S)
342
{
343
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
344
(S ? blockmix_pwxform : blockmix_salsa8);
345
const uint64_t * VROM = shared->shared1.aligned;
346
uint32_t VROM_mask = shared->mask1;
347
size_t s = 16 * r;
348
uint64_t * X = V;
349
uint64_t * Y = &XY[s];
350
uint64_t * Z = S ? S : &XY[2 * s];
351
uint64_t n, i, j;
352
size_t k;
353
354
/* 1: X <-- B */
355
/* 3: V_i <-- X */
356
for (i = 0; i < 2 * r; i++) {
357
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
358
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
359
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
360
for (k = 0; k < 16; k++)
361
tmp->w[k] = le32dec(&src->w[k]);
362
salsa20_simd_shuffle(tmp, dst);
363
}
364
365
/* 4: X <-- H(X) */
366
/* 3: V_i <-- X */
367
blockmix(X, Y, Z, r);
368
blkcpy(&V[s], Y, s);
369
370
X = XY;
371
372
if (NROM && (VROM_mask & 1)) {
373
if ((1 & VROM_mask) == 1) {
374
/* j <-- Integerify(X) mod NROM */
375
j = integerify(Y, r) & (NROM - 1);
376
377
/* X <-- H(X \xor VROM_j) */
378
blkxor(Y, &VROM[j * s], s);
379
}
380
381
blockmix(Y, X, Z, r);
382
383
/* 2: for i = 0 to N - 1 do */
384
for (n = 1, i = 2; i < N; i += 2) {
385
/* 3: V_i <-- X */
386
blkcpy(&V[i * s], X, s);
387
388
if ((i & (i - 1)) == 0)
389
n <<= 1;
390
391
/* j <-- Wrap(Integerify(X), i) */
392
j = integerify(X, r) & (n - 1);
393
j += i - n;
394
395
/* X <-- X \xor V_j */
396
blkxor(X, &V[j * s], s);
397
398
/* 4: X <-- H(X) */
399
blockmix(X, Y, Z, r);
400
401
/* 3: V_i <-- X */
402
blkcpy(&V[(i + 1) * s], Y, s);
403
404
j = integerify(Y, r);
405
if (((i + 1) & VROM_mask) == 1) {
406
/* j <-- Integerify(X) mod NROM */
407
j &= NROM - 1;
408
409
/* X <-- H(X \xor VROM_j) */
410
blkxor(Y, &VROM[j * s], s);
411
} else {
412
/* j <-- Wrap(Integerify(X), i) */
413
j &= n - 1;
414
j += i + 1 - n;
415
416
/* X <-- H(X \xor V_j) */
417
blkxor(Y, &V[j * s], s);
418
}
419
420
blockmix(Y, X, Z, r);
421
}
422
} else {
423
yescrypt_flags_t rw = flags & YESCRYPT_RW;
424
425
/* 4: X <-- H(X) */
426
blockmix(Y, X, Z, r);
427
428
/* 2: for i = 0 to N - 1 do */
429
for (n = 1, i = 2; i < N; i += 2) {
430
/* 3: V_i <-- X */
431
blkcpy(&V[i * s], X, s);
432
433
if (rw) {
434
if ((i & (i - 1)) == 0)
435
n <<= 1;
436
437
/* j <-- Wrap(Integerify(X), i) */
438
j = integerify(X, r) & (n - 1);
439
j += i - n;
440
441
/* X <-- X \xor V_j */
442
blkxor(X, &V[j * s], s);
443
}
444
445
/* 4: X <-- H(X) */
446
blockmix(X, Y, Z, r);
447
448
/* 3: V_i <-- X */
449
blkcpy(&V[(i + 1) * s], Y, s);
450
451
if (rw) {
452
/* j <-- Wrap(Integerify(X), i) */
453
j = integerify(Y, r) & (n - 1);
454
j += (i + 1) - n;
455
456
/* X <-- X \xor V_j */
457
blkxor(Y, &V[j * s], s);
458
}
459
460
/* 4: X <-- H(X) */
461
blockmix(Y, X, Z, r);
462
}
463
}
464
465
/* B' <-- X */
466
for (i = 0; i < 2 * r; i++) {
467
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
468
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
469
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
470
for (k = 0; k < 16; k++)
471
le32enc(&tmp->w[k], src->w[k]);
472
salsa20_simd_unshuffle(tmp, dst);
473
}
474
}
475
476
/**
477
* smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S):
478
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
479
* length; the temporary storage V must be 128rN bytes in length; the temporary
480
* storage XY must be 256r + 64 bytes in length. The value N must be a
481
* power of 2 greater than 1. The value Nloop must be even.
482
*/
483
static void
484
smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop,
485
yescrypt_flags_t flags,
486
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
487
uint64_t * XY, uint64_t * S)
488
{
489
void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) =
490
(S ? blockmix_pwxform : blockmix_salsa8);
491
const uint64_t * VROM = shared->shared1.aligned;
492
uint32_t VROM_mask = shared->mask1 | 1;
493
size_t s = 16 * r;
494
yescrypt_flags_t rw = flags & YESCRYPT_RW;
495
uint64_t * X = XY;
496
uint64_t * Y = &XY[s];
497
uint64_t * Z = S ? S : &XY[2 * s];
498
uint64_t i, j;
499
size_t k;
500
501
if (Nloop == 0)
502
return;
503
504
/* X <-- B' */
505
for (i = 0; i < 2 * r; i++) {
506
const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8];
507
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
508
salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8];
509
for (k = 0; k < 16; k++)
510
tmp->w[k] = le32dec(&src->w[k]);
511
salsa20_simd_shuffle(tmp, dst);
512
}
513
514
if (NROM) {
515
/* 6: for i = 0 to N - 1 do */
516
for (i = 0; i < Nloop; i += 2) {
517
/* 7: j <-- Integerify(X) mod N */
518
j = integerify(X, r) & (N - 1);
519
520
/* 8: X <-- H(X \xor V_j) */
521
blkxor(X, &V[j * s], s);
522
/* V_j <-- Xprev \xor V_j */
523
if (rw)
524
blkcpy(&V[j * s], X, s);
525
blockmix(X, Y, Z, r);
526
527
j = integerify(Y, r);
528
if (((i + 1) & VROM_mask) == 1) {
529
/* j <-- Integerify(X) mod NROM */
530
j &= NROM - 1;
531
532
/* X <-- H(X \xor VROM_j) */
533
blkxor(Y, &VROM[j * s], s);
534
} else {
535
/* 7: j <-- Integerify(X) mod N */
536
j &= N - 1;
537
538
/* 8: X <-- H(X \xor V_j) */
539
blkxor(Y, &V[j * s], s);
540
/* V_j <-- Xprev \xor V_j */
541
if (rw)
542
blkcpy(&V[j * s], Y, s);
543
}
544
545
blockmix(Y, X, Z, r);
546
}
547
} else {
548
/* 6: for i = 0 to N - 1 do */
549
i = Nloop / 2;
550
do {
551
/* 7: j <-- Integerify(X) mod N */
552
j = integerify(X, r) & (N - 1);
553
554
/* 8: X <-- H(X \xor V_j) */
555
blkxor(X, &V[j * s], s);
556
/* V_j <-- Xprev \xor V_j */
557
if (rw)
558
blkcpy(&V[j * s], X, s);
559
blockmix(X, Y, Z, r);
560
561
/* 7: j <-- Integerify(X) mod N */
562
j = integerify(Y, r) & (N - 1);
563
564
/* 8: X <-- H(X \xor V_j) */
565
blkxor(Y, &V[j * s], s);
566
/* V_j <-- Xprev \xor V_j */
567
if (rw)
568
blkcpy(&V[j * s], Y, s);
569
blockmix(Y, X, Z, r);
570
} while (--i);
571
}
572
573
/* 10: B' <-- X */
574
for (i = 0; i < 2 * r; i++) {
575
const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8];
576
salsa20_blk_t *tmp = (salsa20_blk_t *)Y;
577
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8];
578
for (k = 0; k < 16; k++)
579
le32enc(&tmp->w[k], src->w[k]);
580
salsa20_simd_unshuffle(tmp, dst);
581
}
582
}
583
584
/**
 * p2floor(x):
 * Largest power of 2 not greater than argument.
 *
 * Works by repeatedly clearing the lowest set bit until only one bit is
 * left.  An argument of 0 is returned unchanged.
 */
static uint64_t
p2floor(uint64_t x)
{
	uint64_t y;
	while ((y = x & (x - 1)))
		x = y;
	return x;
}

/**
598
* smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
599
* Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
600
* temporary storage V must be 128rN bytes in length; the temporary storage
601
* XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is
602
* required with OpenMP-enabled builds). The value N must be a power of 2
603
* greater than 1.
604
*/
605
static void
606
smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t,
607
yescrypt_flags_t flags,
608
uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared,
609
uint64_t * XY, uint64_t * S)
610
{
611
size_t s = 16 * r;
612
uint64_t Nchunk = N / p, Nloop_all, Nloop_rw;
613
uint32_t i;
614
615
Nloop_all = Nchunk;
616
if (flags & YESCRYPT_RW) {
617
if (t <= 1) {
618
if (t)
619
Nloop_all *= 2; /* 2/3 */
620
Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
621
} else {
622
Nloop_all *= t - 1;
623
}
624
} else if (t) {
625
if (t == 1)
626
Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
627
Nloop_all *= t;
628
}
629
630
Nloop_rw = 0;
631
if (flags & __YESCRYPT_INIT_SHARED)
632
Nloop_rw = Nloop_all;
633
else if (flags & YESCRYPT_RW)
634
Nloop_rw = Nloop_all / p;
635
636
Nchunk &= ~(uint64_t)1; /* round down to even */
637
Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
638
Nloop_rw &= ~(uint64_t)1; /* round down to even */
639
640
#ifdef _OPENMP
641
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
642
{
643
#pragma omp for
644
#endif
645
for (i = 0; i < p; i++) {
646
uint64_t Vchunk = i * Nchunk;
647
uint64_t * Bp = &B[i * s];
648
uint64_t * Vp = &V[Vchunk * s];
649
#ifdef _OPENMP
650
uint64_t * XYp = &XY[i * (2 * s + 8)];
651
#else
652
uint64_t * XYp = XY;
653
#endif
654
uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
655
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
656
if (Sp)
657
smix1(Bp, 1, S_SIZE_ALL / 16,
658
flags & ~YESCRYPT_PWXFORM,
659
Sp, NROM, shared, XYp, NULL);
660
if (!(flags & __YESCRYPT_INIT_SHARED_2))
661
smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
662
smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
663
NROM, shared, XYp, Sp);
664
}
665
666
if (Nloop_all > Nloop_rw) {
667
#ifdef _OPENMP
668
#pragma omp for
669
#endif
670
for (i = 0; i < p; i++) {
671
uint64_t * Bp = &B[i * s];
672
#ifdef _OPENMP
673
uint64_t * XYp = &XY[i * (2 * s + 8)];
674
#else
675
uint64_t * XYp = XY;
676
#endif
677
uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S;
678
smix2(Bp, r, N, Nloop_all - Nloop_rw,
679
flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
680
}
681
}
682
#ifdef _OPENMP
683
}
684
#endif
685
}
686
687
/**
688
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
689
* N, r, p, t, flags, buf, buflen):
690
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
691
* p, buflen), or a revision of scrypt as requested by flags and shared, and
692
* write the result into buf. The parameters r, p, and buflen must satisfy
693
* r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
694
* of 2 greater than 1.
695
*
696
* t controls computation time while not affecting peak memory usage. shared
697
* and flags may request special modes as described in yescrypt.h. local is
698
* the thread-local data structure, allowing to preserve and reuse a memory
699
* allocation across calls, thereby reducing its overhead.
700
*
701
* Return 0 on success; or -1 on error.
702
*/
703
int
704
yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
705
const uint8_t * passwd, size_t passwdlen,
706
const uint8_t * salt, size_t saltlen,
707
uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags,
708
uint8_t * buf, size_t buflen)
709
{
710
yescrypt_region_t tmp;
711
uint64_t NROM;
712
size_t B_size, V_size, XY_size, need;
713
uint64_t * B, * V, * XY, * S;
714
uint64_t sha256[4];
715
716
/*
717
* YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose,
718
* so don't let it have side-effects. Without this adjustment, it'd
719
* enable the SHA-256 password pre-hashing and output post-hashing,
720
* because any deviation from classic scrypt implies those.
721
*/
722
if (p == 1)
723
flags &= ~YESCRYPT_PARALLEL_SMIX;
724
725
/* Sanity-check parameters */
726
if (flags & ~YESCRYPT_KNOWN_FLAGS) {
727
errno = EINVAL;
728
return -1;
729
}
730
#if SIZE_MAX > UINT32_MAX
731
if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
732
errno = EFBIG;
733
return -1;
734
}
735
#endif
736
if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
737
errno = EFBIG;
738
return -1;
739
}
740
if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) {
741
errno = EINVAL;
742
return -1;
743
}
744
if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) {
745
errno = EINVAL;
746
return -1;
747
}
748
#if S_MIN_R > 1
749
if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) {
750
errno = EINVAL;
751
return -1;
752
}
753
#endif
754
if ((p > SIZE_MAX / ((size_t)256 * r + 64)) ||
755
#if SIZE_MAX / 256 <= UINT32_MAX
756
(r > SIZE_MAX / 256) ||
757
#endif
758
(N > SIZE_MAX / 128 / r)) {
759
errno = ENOMEM;
760
return -1;
761
}
762
if (N > UINT64_MAX / ((uint64_t)t + 1)) {
763
errno = EFBIG;
764
return -1;
765
}
766
#ifdef _OPENMP
767
if (!(flags & YESCRYPT_PARALLEL_SMIX) &&
768
(N > SIZE_MAX / 128 / (r * p))) {
769
errno = ENOMEM;
770
return -1;
771
}
772
#endif
773
if ((flags & YESCRYPT_PWXFORM) &&
774
#ifndef _OPENMP
775
(flags & YESCRYPT_PARALLEL_SMIX) &&
776
#endif
777
p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) {
778
errno = ENOMEM;
779
return -1;
780
}
781
782
NROM = 0;
783
if (shared->shared1.aligned) {
784
NROM = shared->shared1.aligned_size / ((size_t)128 * r);
785
if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) ||
786
!(flags & YESCRYPT_RW)) {
787
errno = EINVAL;
788
return -1;
789
}
790
}
791
792
/* Allocate memory */
793
V = NULL;
794
V_size = (size_t)128 * r * N;
795
#ifdef _OPENMP
796
if (!(flags & YESCRYPT_PARALLEL_SMIX))
797
V_size *= p;
798
#endif
799
need = V_size;
800
if (flags & __YESCRYPT_INIT_SHARED) {
801
if (local->aligned_size < need) {
802
if (local->base || local->aligned ||
803
local->base_size || local->aligned_size) {
804
errno = EINVAL;
805
return -1;
806
}
807
if (!alloc_region(local, need))
808
return -1;
809
}
810
V = (uint64_t *)local->aligned;
811
need = 0;
812
}
813
B_size = (size_t)128 * r * p;
814
need += B_size;
815
if (need < B_size) {
816
errno = ENOMEM;
817
return -1;
818
}
819
XY_size = (size_t)256 * r + 64;
820
#ifdef _OPENMP
821
XY_size *= p;
822
#endif
823
need += XY_size;
824
if (need < XY_size) {
825
errno = ENOMEM;
826
return -1;
827
}
828
if (flags & YESCRYPT_PWXFORM) {
829
size_t S_size = S_SIZE_ALL * sizeof(*S);
830
#ifdef _OPENMP
831
S_size *= p;
832
#else
833
if (flags & YESCRYPT_PARALLEL_SMIX)
834
S_size *= p;
835
#endif
836
need += S_size;
837
if (need < S_size) {
838
errno = ENOMEM;
839
return -1;
840
}
841
}
842
if (flags & __YESCRYPT_INIT_SHARED) {
843
if (!alloc_region(&tmp, need))
844
return -1;
845
B = (uint64_t *)tmp.aligned;
846
XY = (uint64_t *)((uint8_t *)B + B_size);
847
} else {
848
init_region(&tmp);
849
if (local->aligned_size < need) {
850
if (free_region(local))
851
return -1;
852
if (!alloc_region(local, need))
853
return -1;
854
}
855
B = (uint64_t *)local->aligned;
856
V = (uint64_t *)((uint8_t *)B + B_size);
857
XY = (uint64_t *)((uint8_t *)V + V_size);
858
}
859
S = NULL;
860
if (flags & YESCRYPT_PWXFORM)
861
S = (uint64_t *)((uint8_t *)XY + XY_size);
862
863
if (t || flags) {
864
SHA256_CTX_Y ctx;
865
SHA256_Init_Y(&ctx);
866
SHA256_Update_Y(&ctx, passwd, passwdlen);
867
SHA256_Final_Y((uint8_t *)sha256, &ctx);
868
passwd = (uint8_t *)sha256;
869
passwdlen = sizeof(sha256);
870
}
871
872
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
873
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,
874
(uint8_t *)B, B_size);
875
876
if (t || flags)
877
blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));
878
879
if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) {
880
smix(B, r, N, p, t, flags, V, NROM, shared, XY, S);
881
} else {
882
uint32_t i;
883
884
/* 2: for i = 0 to p - 1 do */
885
#ifdef _OPENMP
886
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S)
887
#endif
888
for (i = 0; i < p; i++) {
889
/* 3: B_i <-- MF(B_i, N) */
890
#ifdef _OPENMP
891
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags,
892
&V[(size_t)16 * r * i * N],
893
NROM, shared,
894
&XY[((size_t)32 * r + 8) * i],
895
S ? &S[S_SIZE_ALL * i] : S);
896
#else
897
smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V,
898
NROM, shared, XY, S);
899
#endif
900
}
901
}
902
903
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
904
PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen);
905
906
/*
907
* Except when computing classic scrypt, allow all computation so far
908
* to be performed on the client. The final steps below match those of
909
* SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
910
* far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
911
* SCRAM's use of SHA-1) would be usable with yescrypt hashes.
912
*/
913
if ((t || flags) && buflen == sizeof(sha256)) {
914
/* Compute ClientKey */
915
{
916
HMAC_SHA256_CTX_Y ctx;
917
HMAC_SHA256_Init_Y(&ctx, buf, buflen);
918
if (yescrypt_client_key != NULL)
919
HMAC_SHA256_Update_Y(&ctx, yescrypt_client_key,
920
yescrypt_client_key_len);
921
else
922
HMAC_SHA256_Update_Y(&ctx, salt, saltlen);
923
HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx);
924
}
925
/* Compute StoredKey */
926
{
927
SHA256_CTX_Y ctx;
928
SHA256_Init_Y(&ctx);
929
SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256));
930
SHA256_Final_Y(buf, &ctx);
931
}
932
}
933
934
if (free_region(&tmp))
935
return -1;
936
937
/* Success! */
938
return 0;
939
}
940
941