/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */
/*
 * Luffa implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <[email protected]>
 */

#include <stddef.h>
#include <string.h>
#include <limits.h>

#include "sph_luffa.h"

#ifdef __cplusplus
extern "C" {
#endif

#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL   1
#endif

#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

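/*
 * Initial values for the 256-bit chaining lanes V[0]..V[4], and the
 * per-lane round constants (RCxy), as given in the Luffa
 * specification. The RCW* tables pack the constants of two lanes
 * into 64-bit words for the parallel code path (e.g. RCW010[r] is
 * RC10[r] in the high half and RC00[r] in the low half).
 */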
static const sph_u32 V_INIT[5][8] = {
	{
		SPH_C32(0x6d251e69), SPH_C32(0x44b051e0),
		SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465),
		SPH_C32(0x6e292011), SPH_C32(0x90152df4),
		SPH_C32(0xee058139), SPH_C32(0xdef610bb)
	}, {
		SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256),
		SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3),
		SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3),
		SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581)
	}, {
		SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781),
		SPH_C32(0x04016ce5), SPH_C32(0xad659c05),
		SPH_C32(0x0306194f), SPH_C32(0x666d1836),
		SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7)
	}, {
		SPH_C32(0x858075d5), SPH_C32(0x36d79cce),
		SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67),
		SPH_C32(0x35870c6a), SPH_C32(0x57e9e923),
		SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce)
	}, {
		SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22),
		SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363),
		SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1),
		SPH_C32(0xb07224cc), SPH_C32(0x03e86cea)
	}
};

static const sph_u32 RC00[8] = {
	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
	SPH_C32(0x1e00108f), SPH_C32(0x7800423d),
	SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12)
};

static const sph_u32 RC04[8] = {
	SPH_C32(0xe0337818), SPH_C32(0x441ba90d),
	SPH_C32(0x7f34d442), SPH_C32(0x9389217f),
	SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4),
	SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d)
};

static const sph_u32 RC10[8] = {
	SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae),
	SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51),
	SPH_C32(0x707a3d45), SPH_C32(0xaeb28562),
	SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e)
};

static const sph_u32 RC14[8] = {
	SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4),
	SPH_C32(0xbd09caca), SPH_C32(0xf4272b28),
	SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b),
	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW010[8] = {
	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
};

static const sph_u64 RCW014[8] = {
	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
};

#endif

static const sph_u32 RC20[8] = {
	SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25),
	SPH_C32(0x7ad8818f), SPH_C32(0x8438764a),
	SPH_C32(0xbb6de032), SPH_C32(0xedb780c8),
	SPH_C32(0xd9847356), SPH_C32(0xa2c78434)
};

static const sph_u32 RC24[8] = {
	SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72),
	SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7),
	SPH_C32(0x78e38b9d), SPH_C32(0x27586719),
	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
};

static const sph_u32 RC30[8] = {
	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};

static const sph_u32 RC34[8] = {
	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW230[8] = {
	SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25),
	SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a),
	SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8),
	SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434)
};

static const sph_u64 RCW234[8] = {
	SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72),
	SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7),
	SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719),
	SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7)
};

#endif

static const sph_u32 RC40[8] = {
	SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa),
	SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9),
	SPH_C32(0x78602649), SPH_C32(0x8edae952),
	SPH_C32(0x3b6ba548), SPH_C32(0xedae9520)
};

static const sph_u32 RC44[8] = {
	SPH_C32(0x5090d577), SPH_C32(0x2d1925ab),
	SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0),
	SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3),
	SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31)
};

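/*
 * Helper macros. DECL_TMP8 declares eight 32-bit words w0..w7.
 * M2 multiplies such an 8-word tuple by 2 in the message injection
 * ring (feedback polynomial x^8 + x^4 + x^3 + x + 1, applied
 * word-wise: positions 0, 1, 3 and 4 receive the wrapped-around
 * top word). XOR combines two tuples word by word.
 */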
#define DECL_TMP8(w) \
	sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7;

#define M2(d, s)   do { \
		sph_u32 tmp = s ## 7; \
		d ## 7 = s ## 6; \
		d ## 6 = s ## 5; \
		d ## 5 = s ## 4; \
		d ## 4 = s ## 3 ^ tmp; \
		d ## 3 = s ## 2 ^ tmp; \
		d ## 2 = s ## 1; \
		d ## 1 = s ## 0 ^ tmp; \
		d ## 0 = tmp; \
	} while (0)

#define XOR(d, s1, s2)   do { \
		d ## 0 = s1 ## 0 ^ s2 ## 0; \
		d ## 1 = s1 ## 1 ^ s2 ## 1; \
		d ## 2 = s1 ## 2 ^ s2 ## 2; \
		d ## 3 = s1 ## 3 ^ s2 ## 3; \
		d ## 4 = s1 ## 4 ^ s2 ## 4; \
		d ## 5 = s1 ## 5 ^ s2 ## 5; \
		d ## 6 = s1 ## 6 ^ s2 ## 6; \
		d ## 7 = s1 ## 7 ^ s2 ## 7; \
	} while (0)

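/*
 * SUB_CRUMB applies the Luffa 4-bit S-box in bitsliced form across
 * four words; SUB_CRUMB_GEN is instantiated at width 32 (scalar)
 * and width 64 (two lanes processed side by side within one 64-bit
 * word). MIX_WORDW below performs MixWord on both 32-bit halves of
 * a packed 64-bit word independently; the #if 0 block keeps an
 * older masked-rotation variant, disabled.
 */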
#if SPH_LUFFA_PARALLEL

#define SUB_CRUMB_GEN(a0, a1, a2, a3, width)   do { \
		sph_u ## width tmp; \
		tmp = (a0); \
		(a0) |= (a1); \
		(a2) ^= (a3); \
		(a1) = SPH_T ## width(~(a1)); \
		(a0) ^= (a3); \
		(a3) &= tmp; \
		(a1) ^= (a3); \
		(a3) ^= (a2); \
		(a2) &= (a0); \
		(a0) = SPH_T ## width(~(a0)); \
		(a2) ^= (a1); \
		(a1) |= (a3); \
		tmp ^= (a1); \
		(a3) ^= (a2); \
		(a2) &= (a1); \
		(a1) ^= (a0); \
		(a0) = tmp; \
	} while (0)

#define SUB_CRUMB(a0, a1, a2, a3)    SUB_CRUMB_GEN(a0, a1, a2, a3, 32)
#define SUB_CRUMBW(a0, a1, a2, a3)   SUB_CRUMB_GEN(a0, a1, a2, a3, 64)

#if 0

#define ROL32W(x, n)   SPH_T64( \
		(((x) << (n)) \
		& ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \
		| (((x) >> (32 - (n))) \
		& ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n))))

#define MIX_WORDW(u, v)   do { \
		(v) ^= (u); \
		(u) = ROL32W((u), 2) ^ (v); \
		(v) = ROL32W((v), 14) ^ (u); \
		(u) = ROL32W((u), 10) ^ (v); \
		(v) = ROL32W((v), 1); \
	} while (0)

#endif

#define MIX_WORDW(u, v)   do { \
		sph_u32 ul, uh, vl, vh; \
		(v) ^= (u); \
		ul = SPH_T32((sph_u32)(u)); \
		uh = SPH_T32((sph_u32)((u) >> 32)); \
		vl = SPH_T32((sph_u32)(v)); \
		vh = SPH_T32((sph_u32)((v) >> 32)); \
		ul = SPH_ROTL32(ul, 2) ^ vl; \
		vl = SPH_ROTL32(vl, 14) ^ ul; \
		ul = SPH_ROTL32(ul, 10) ^ vl; \
		vl = SPH_ROTL32(vl, 1); \
		uh = SPH_ROTL32(uh, 2) ^ vh; \
		vh = SPH_ROTL32(vh, 14) ^ uh; \
		uh = SPH_ROTL32(uh, 10) ^ vh; \
		vh = SPH_ROTL32(vh, 1); \
		(u) = (sph_u64)ul | ((sph_u64)uh << 32); \
		(v) = (sph_u64)vl | ((sph_u64)vh << 32); \
	} while (0)

#else

#define SUB_CRUMB(a0, a1, a2, a3)   do { \
		sph_u32 tmp; \
		tmp = (a0); \
		(a0) |= (a1); \
		(a2) ^= (a3); \
		(a1) = SPH_T32(~(a1)); \
		(a0) ^= (a3); \
		(a3) &= tmp; \
		(a1) ^= (a3); \
		(a3) ^= (a2); \
		(a2) &= (a0); \
		(a0) = SPH_T32(~(a0)); \
		(a2) ^= (a1); \
		(a1) |= (a3); \
		tmp ^= (a1); \
		(a3) ^= (a2); \
		(a2) &= (a1); \
		(a1) ^= (a0); \
		(a0) = tmp; \
	} while (0)

#endif

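/*
 * MIX_WORD is the MixWord linear diffusion step: two 32-bit words
 * are mixed through XORs and left rotations by 2, 14, 10 and 1.
 */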
#define MIX_WORD(u, v)   do { \
		(v) ^= (u); \
		(u) = SPH_ROTL32((u), 2) ^ (v); \
		(v) = SPH_ROTL32((v), 14) ^ (u); \
		(u) = SPH_ROTL32((u), 10) ^ (v); \
		(v) = SPH_ROTL32((v), 1); \
	} while (0)

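/*
 * State handling for the 3-lane (224/256-bit output) variant:
 * declare, load and store the 3 x 8 words of chaining state.
 */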
#define DECL_STATE3 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27;

#define READ_STATE3(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
	} while (0)

#define WRITE_STATE3(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
	} while (0)

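/*
 * MI3 is the message injection for three lanes: the 32-byte block
 * is decoded big-endian, and mixed into each lane together with the
 * XOR sum of all lanes multiplied by 2 (M2). TWEAK3 rotates the
 * upper four words of lanes 1 and 2 by 1 and 2 bits respectively.
 */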
#define MI3   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(a, a, V2); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V0, M, V0); \
		M2(M, M); \
		XOR(V1, a, V1); \
		XOR(V1, M, V1); \
		M2(M, M); \
		XOR(V2, a, V2); \
		XOR(V2, M, V2); \
	} while (0)

#define TWEAK3   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
	} while (0)

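/*
 * P3 is the core permutation for three lanes: after TWEAK3, each
 * lane runs 8 rounds of SubCrumb, MixWord and round-constant
 * addition. In the parallel build, lanes 0 and 1 are packed into
 * 64-bit words and processed together; lane 2 stays scalar.
 */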
#if SPH_LUFFA_PARALLEL

#define P3   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK3; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
	} while (0)

#else

#define P3   do { \
		int r; \
		TWEAK3; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
	} while (0)

#endif

#define DECL_STATE4 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37;

#define READ_STATE4(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
		V30 = (state)->V[3][0]; \
		V31 = (state)->V[3][1]; \
		V32 = (state)->V[3][2]; \
		V33 = (state)->V[3][3]; \
		V34 = (state)->V[3][4]; \
		V35 = (state)->V[3][5]; \
		V36 = (state)->V[3][6]; \
		V37 = (state)->V[3][7]; \
	} while (0)

#define WRITE_STATE4(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
		(state)->V[3][0] = V30; \
		(state)->V[3][1] = V31; \
		(state)->V[3][2] = V32; \
		(state)->V[3][3] = V33; \
		(state)->V[3][4] = V34; \
		(state)->V[3][5] = V35; \
		(state)->V[3][6] = V36; \
		(state)->V[3][7] = V37; \
	} while (0)

#define MI4   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		DECL_TMP8(b) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(b, V2, V3); \
		XOR(a, a, b); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V1, a, V1); \
		XOR(V2, a, V2); \
		XOR(V3, a, V3); \
		M2(b, V0); \
		XOR(b, b, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V1); \
		M2(V1, V1); \
		XOR(V1, V1, V0); \
		XOR(V0, b, M); \
		M2(M, M); \
		XOR(V1, V1, M); \
		M2(M, M); \
		XOR(V2, V2, M); \
		M2(M, M); \
		XOR(V3, V3, M); \
	} while (0)

#define TWEAK4   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
		V34 = SPH_ROTL32(V34, 3); \
		V35 = SPH_ROTL32(V35, 3); \
		V36 = SPH_ROTL32(V36, 3); \
		V37 = SPH_ROTL32(V37, 3); \
	} while (0)

#if SPH_LUFFA_PARALLEL

#define P4   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK4; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \
		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW230[r]; \
			W4 ^= RCW234[r]; \
		} \
		V20 = SPH_T32((sph_u32)W0); \
		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
		V21 = SPH_T32((sph_u32)W1); \
		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
		V22 = SPH_T32((sph_u32)W2); \
		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
		V23 = SPH_T32((sph_u32)W3); \
		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
		V24 = SPH_T32((sph_u32)W4); \
		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
		V25 = SPH_T32((sph_u32)W5); \
		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
		V26 = SPH_T32((sph_u32)W6); \
		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
		V27 = SPH_T32((sph_u32)W7); \
		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
	} while (0)

#else

#define P4   do { \
		int r; \
		TWEAK4; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V30, V31, V32, V33); \
			SUB_CRUMB(V35, V36, V37, V34); \
			MIX_WORD(V30, V34); \
			MIX_WORD(V31, V35); \
			MIX_WORD(V32, V36); \
			MIX_WORD(V33, V37); \
			V30 ^= RC30[r]; \
			V34 ^= RC34[r]; \
		} \
	} while (0)

#endif

#define DECL_STATE5 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \
	sph_u32 V40, V41, V42, V43, V44, V45, V46, V47;

#define READ_STATE5(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
		V30 = (state)->V[3][0]; \
		V31 = (state)->V[3][1]; \
		V32 = (state)->V[3][2]; \
		V33 = (state)->V[3][3]; \
		V34 = (state)->V[3][4]; \
		V35 = (state)->V[3][5]; \
		V36 = (state)->V[3][6]; \
		V37 = (state)->V[3][7]; \
		V40 = (state)->V[4][0]; \
		V41 = (state)->V[4][1]; \
		V42 = (state)->V[4][2]; \
		V43 = (state)->V[4][3]; \
		V44 = (state)->V[4][4]; \
		V45 = (state)->V[4][5]; \
		V46 = (state)->V[4][6]; \
		V47 = (state)->V[4][7]; \
	} while (0)

#define WRITE_STATE5(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
		(state)->V[3][0] = V30; \
		(state)->V[3][1] = V31; \
		(state)->V[3][2] = V32; \
		(state)->V[3][3] = V33; \
		(state)->V[3][4] = V34; \
		(state)->V[3][5] = V35; \
		(state)->V[3][6] = V36; \
		(state)->V[3][7] = V37; \
		(state)->V[4][0] = V40; \
		(state)->V[4][1] = V41; \
		(state)->V[4][2] = V42; \
		(state)->V[4][3] = V43; \
		(state)->V[4][4] = V44; \
		(state)->V[4][5] = V45; \
		(state)->V[4][6] = V46; \
		(state)->V[4][7] = V47; \
	} while (0)

#define MI5   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		DECL_TMP8(b) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(b, V2, V3); \
		XOR(a, a, b); \
		XOR(a, a, V4); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V1, a, V1); \
		XOR(V2, a, V2); \
		XOR(V3, a, V3); \
		XOR(V4, a, V4); \
		M2(b, V0); \
		XOR(b, b, V1); \
		M2(V1, V1); \
		XOR(V1, V1, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V4); \
		M2(V4, V4); \
		XOR(V4, V4, V0); \
		M2(V0, b); \
		XOR(V0, V0, V4); \
		M2(V4, V4); \
		XOR(V4, V4, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V1); \
		M2(V1, V1); \
		XOR(V1, V1, b); \
		XOR(V0, V0, M); \
		M2(M, M); \
		XOR(V1, V1, M); \
		M2(M, M); \
		XOR(V2, V2, M); \
		M2(M, M); \
		XOR(V3, V3, M); \
		M2(M, M); \
		XOR(V4, V4, M); \
	} while (0)

#define TWEAK5   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
		V34 = SPH_ROTL32(V34, 3); \
		V35 = SPH_ROTL32(V35, 3); \
		V36 = SPH_ROTL32(V36, 3); \
		V37 = SPH_ROTL32(V37, 3); \
		V44 = SPH_ROTL32(V44, 4); \
		V45 = SPH_ROTL32(V45, 4); \
		V46 = SPH_ROTL32(V46, 4); \
		V47 = SPH_ROTL32(V47, 4); \
	} while (0)

#if SPH_LUFFA_PARALLEL

#define P5   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK5; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \
		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW230[r]; \
			W4 ^= RCW234[r]; \
		} \
		V20 = SPH_T32((sph_u32)W0); \
		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
		V21 = SPH_T32((sph_u32)W1); \
		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
		V22 = SPH_T32((sph_u32)W2); \
		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
		V23 = SPH_T32((sph_u32)W3); \
		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
		V24 = SPH_T32((sph_u32)W4); \
		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
		V25 = SPH_T32((sph_u32)W5); \
		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
		V26 = SPH_T32((sph_u32)W6); \
		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
		V27 = SPH_T32((sph_u32)W7); \
		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V40, V41, V42, V43); \
			SUB_CRUMB(V45, V46, V47, V44); \
			MIX_WORD(V40, V44); \
			MIX_WORD(V41, V45); \
			MIX_WORD(V42, V46); \
			MIX_WORD(V43, V47); \
			V40 ^= RC40[r]; \
			V44 ^= RC44[r]; \
		} \
	} while (0)

#else

#define P5   do { \
		int r; \
		TWEAK5; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V30, V31, V32, V33); \
			SUB_CRUMB(V35, V36, V37, V34); \
			MIX_WORD(V30, V34); \
			MIX_WORD(V31, V35); \
			MIX_WORD(V32, V36); \
			MIX_WORD(V33, V37); \
			V30 ^= RC30[r]; \
			V34 ^= RC34[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V40, V41, V42, V43); \
			SUB_CRUMB(V45, V46, V47, V44); \
			MIX_WORD(V40, V44); \
			MIX_WORD(V41, V45); \
			MIX_WORD(V42, V46); \
			MIX_WORD(V43, V47); \
			V40 ^= RC40[r]; \
			V44 ^= RC44[r]; \
		} \
	} while (0)

#endif

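/*
 * Streaming core: luffa3/4/5 buffer the input and run MI+P on each
 * full 32-byte block; the *_close variants append the final padding
 * byte, process blank rounds on zero blocks, and extract the digest
 * as the big-endian XOR of all lanes.
 */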
static void
luffa3(sph_luffa224_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE3

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE3(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI3;
			P3;
			ptr = 0;
		}
	}
	WRITE_STATE3(sc);
	sc->ptr = ptr;
}

static void
luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n,
	void *dst, unsigned out_size_w32)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE3

	buf = sc->buf;
	ptr = sc->ptr;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE3(sc);
	for (i = 0; i < 2; i ++) {
		MI3;
		P3;
		memset(buf, 0, sizeof sc->buf);
	}
	out = dst;
	sph_enc32be(out +  0, V00 ^ V10 ^ V20);
	sph_enc32be(out +  4, V01 ^ V11 ^ V21);
	sph_enc32be(out +  8, V02 ^ V12 ^ V22);
	sph_enc32be(out + 12, V03 ^ V13 ^ V23);
	sph_enc32be(out + 16, V04 ^ V14 ^ V24);
	sph_enc32be(out + 20, V05 ^ V15 ^ V25);
	sph_enc32be(out + 24, V06 ^ V16 ^ V26);
	if (out_size_w32 > 7)
		sph_enc32be(out + 28, V07 ^ V17 ^ V27);
}

static void
luffa4(sph_luffa384_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE4

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE4(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI4;
			P4;
			ptr = 0;
		}
	}
	WRITE_STATE4(sc);
	sc->ptr = ptr;
}

static void
luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE4

	buf = sc->buf;
	ptr = sc->ptr;
	out = dst;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE4(sc);
	for (i = 0; i < 3; i ++) {
		MI4;
		P4;
		switch (i) {
		case 0:
			memset(buf, 0, sizeof sc->buf);
			break;
		case 1:
			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30);
			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31);
			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32);
			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33);
			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34);
			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35);
			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36);
			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37);
			break;
		case 2:
			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30);
			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31);
			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32);
			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33);
			break;
		}
	}
}

static void
luffa5(sph_luffa512_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE5

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE5(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI5;
			P5;
			ptr = 0;
		}
	}
	WRITE_STATE5(sc);
	sc->ptr = ptr;
}

static void
luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE5

	buf = sc->buf;
	ptr = sc->ptr;
	out = dst;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE5(sc);
	for (i = 0; i < 3; i ++) {
		MI5;
		P5;
		switch (i) {
		case 0:
			memset(buf, 0, sizeof sc->buf);
			break;
		case 1:
			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30 ^ V40);
			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31 ^ V41);
			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32 ^ V42);
			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43);
			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34 ^ V44);
			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45);
			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46);
			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47);
			break;
		case 2:
			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40);
			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41);
			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42);
			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43);
			sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44);
			sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45);
			sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46);
			sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47);
			break;
		}
	}
}

/* see sph_luffa.h */
void
sph_luffa224_init(void *cc)
{
	sph_luffa224_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa224(void *cc, const void *data, size_t len)
{
	luffa3(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa224_close(void *cc, void *dst)
{
	sph_luffa224_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa3_close(cc, ub, n, dst, 7);
	sph_luffa224_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa256_init(void *cc)
{
	sph_luffa256_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa256(void *cc, const void *data, size_t len)
{
	luffa3(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa256_close(void *cc, void *dst)
{
	sph_luffa256_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa3_close(cc, ub, n, dst, 8);
	sph_luffa256_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa384_init(void *cc)
{
	sph_luffa384_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa384(void *cc, const void *data, size_t len)
{
	luffa4(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa384_close(void *cc, void *dst)
{
	sph_luffa384_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa4_close(cc, ub, n, dst);
	sph_luffa384_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa512_init(void *cc)
{
	sph_luffa512_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa512(void *cc, const void *data, size_t len)
{
	luffa5(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa512_close(void *cc, void *dst)
{
	sph_luffa512_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa5_close(cc, ub, n, dst);
	sph_luffa512_init(cc);
}

#ifdef __cplusplus
}
#endif
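
/*
 * Illustrative usage sketch (not part of the original sphlib file):
 * hashing a message with the Luffa-512 streaming API defined above.
 * Kept inside #if 0, following this file's own convention for
 * reference-only code; example_luffa512 is a hypothetical helper.
 */
#if 0

#include <stdio.h>

static void
example_luffa512(const void *msg, size_t len)
{
	sph_luffa512_context cc;
	unsigned char digest[64];
	size_t u;

	sph_luffa512_init(&cc);
	sph_luffa512(&cc, msg, len);       /* absorb the message */
	sph_luffa512_close(&cc, digest);   /* pad, finalize, output */
	for (u = 0; u < sizeof digest; u ++)
		printf("%02x", digest[u]);
	printf("\n");
}

#endif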