/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */
/*
 * Luffa implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <[email protected]>
 */

#include <stddef.h>
#include <string.h>
#include <limits.h>

#include "sph_luffa.h"

#ifdef __cplusplus
extern "C" {
#endif

#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL   1
#endif

#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

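/*
 * Initial values for the 256-bit chaining lanes V[0]..V[4], and the
 * per-lane round constants (RCxy), as given in the Luffa
 * specification. The RCW* tables pack the constants of two lanes
 * into 64-bit words for the parallel code path (e.g. RCW010[r] is
 * RC10[r] in the high half and RC00[r] in the low half).
 */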
static const sph_u32 V_INIT[5][8] = {
	{
		SPH_C32(0x6d251e69), SPH_C32(0x44b051e0),
		SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465),
		SPH_C32(0x6e292011), SPH_C32(0x90152df4),
		SPH_C32(0xee058139), SPH_C32(0xdef610bb)
	}, {
		SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256),
		SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3),
		SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3),
		SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581)
	}, {
		SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781),
		SPH_C32(0x04016ce5), SPH_C32(0xad659c05),
		SPH_C32(0x0306194f), SPH_C32(0x666d1836),
		SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7)
	}, {
		SPH_C32(0x858075d5), SPH_C32(0x36d79cce),
		SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67),
		SPH_C32(0x35870c6a), SPH_C32(0x57e9e923),
		SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce)
	}, {
		SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22),
		SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363),
		SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1),
		SPH_C32(0xb07224cc), SPH_C32(0x03e86cea)
	}
};

static const sph_u32 RC00[8] = {
	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
	SPH_C32(0x1e00108f), SPH_C32(0x7800423d),
	SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12)
};

static const sph_u32 RC04[8] = {
	SPH_C32(0xe0337818), SPH_C32(0x441ba90d),
	SPH_C32(0x7f34d442), SPH_C32(0x9389217f),
	SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4),
	SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d)
};

static const sph_u32 RC10[8] = {
	SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae),
	SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51),
	SPH_C32(0x707a3d45), SPH_C32(0xaeb28562),
	SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e)
};

static const sph_u32 RC14[8] = {
	SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4),
	SPH_C32(0xbd09caca), SPH_C32(0xf4272b28),
	SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b),
	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW010[8] = {
	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
};

static const sph_u64 RCW014[8] = {
	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
};

#endif

static const sph_u32 RC20[8] = {
	SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25),
	SPH_C32(0x7ad8818f), SPH_C32(0x8438764a),
	SPH_C32(0xbb6de032), SPH_C32(0xedb780c8),
	SPH_C32(0xd9847356), SPH_C32(0xa2c78434)
};

static const sph_u32 RC24[8] = {
	SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72),
	SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7),
	SPH_C32(0x78e38b9d), SPH_C32(0x27586719),
	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
};

static const sph_u32 RC30[8] = {
	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
};

static const sph_u32 RC34[8] = {
	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
};

#if SPH_LUFFA_PARALLEL

static const sph_u64 RCW230[8] = {
	SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25),
	SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a),
	SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8),
	SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434)
};

static const sph_u64 RCW234[8] = {
	SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72),
	SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7),
	SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719),
	SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7)
};

#endif

static const sph_u32 RC40[8] = {
	SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa),
	SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9),
	SPH_C32(0x78602649), SPH_C32(0x8edae952),
	SPH_C32(0x3b6ba548), SPH_C32(0xedae9520)
};

static const sph_u32 RC44[8] = {
	SPH_C32(0x5090d577), SPH_C32(0x2d1925ab),
	SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0),
	SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3),
	SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31)
};

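/*
 * Helper macros. DECL_TMP8 declares eight 32-bit words w0..w7.
 * M2 multiplies such an 8-word tuple by 2 in the message injection
 * ring (feedback polynomial x^8 + x^4 + x^3 + x + 1, applied
 * word-wise: positions 0, 1, 3 and 4 receive the wrapped-around
 * top word). XOR combines two tuples word by word.
 */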
#define DECL_TMP8(w) \
	sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7;

#define M2(d, s)   do { \
		sph_u32 tmp = s ## 7; \
		d ## 7 = s ## 6; \
		d ## 6 = s ## 5; \
		d ## 5 = s ## 4; \
		d ## 4 = s ## 3 ^ tmp; \
		d ## 3 = s ## 2 ^ tmp; \
		d ## 2 = s ## 1; \
		d ## 1 = s ## 0 ^ tmp; \
		d ## 0 = tmp; \
	} while (0)

#define XOR(d, s1, s2)   do { \
		d ## 0 = s1 ## 0 ^ s2 ## 0; \
		d ## 1 = s1 ## 1 ^ s2 ## 1; \
		d ## 2 = s1 ## 2 ^ s2 ## 2; \
		d ## 3 = s1 ## 3 ^ s2 ## 3; \
		d ## 4 = s1 ## 4 ^ s2 ## 4; \
		d ## 5 = s1 ## 5 ^ s2 ## 5; \
		d ## 6 = s1 ## 6 ^ s2 ## 6; \
		d ## 7 = s1 ## 7 ^ s2 ## 7; \
	} while (0)

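/*
 * SUB_CRUMB applies the Luffa 4-bit S-box in bitsliced form across
 * four words; SUB_CRUMB_GEN is instantiated at width 32 (scalar)
 * and width 64 (two lanes processed side by side within one 64-bit
 * word). MIX_WORDW below performs MixWord on both 32-bit halves of
 * a packed 64-bit word independently; the #if 0 block keeps an
 * older masked-rotation variant, disabled.
 */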
#if SPH_LUFFA_PARALLEL

#define SUB_CRUMB_GEN(a0, a1, a2, a3, width)   do { \
		sph_u ## width tmp; \
		tmp = (a0); \
		(a0) |= (a1); \
		(a2) ^= (a3); \
		(a1) = SPH_T ## width(~(a1)); \
		(a0) ^= (a3); \
		(a3) &= tmp; \
		(a1) ^= (a3); \
		(a3) ^= (a2); \
		(a2) &= (a0); \
		(a0) = SPH_T ## width(~(a0)); \
		(a2) ^= (a1); \
		(a1) |= (a3); \
		tmp ^= (a1); \
		(a3) ^= (a2); \
		(a2) &= (a1); \
		(a1) ^= (a0); \
		(a0) = tmp; \
	} while (0)

#define SUB_CRUMB(a0, a1, a2, a3)    SUB_CRUMB_GEN(a0, a1, a2, a3, 32)
#define SUB_CRUMBW(a0, a1, a2, a3)   SUB_CRUMB_GEN(a0, a1, a2, a3, 64)

#if 0

#define ROL32W(x, n)   SPH_T64( \
		(((x) << (n)) \
		& ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \
		| (((x) >> (32 - (n))) \
		& ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n))))

#define MIX_WORDW(u, v)   do { \
		(v) ^= (u); \
		(u) = ROL32W((u), 2) ^ (v); \
		(v) = ROL32W((v), 14) ^ (u); \
		(u) = ROL32W((u), 10) ^ (v); \
		(v) = ROL32W((v), 1); \
	} while (0)

#endif

#define MIX_WORDW(u, v)   do { \
		sph_u32 ul, uh, vl, vh; \
		(v) ^= (u); \
		ul = SPH_T32((sph_u32)(u)); \
		uh = SPH_T32((sph_u32)((u) >> 32)); \
		vl = SPH_T32((sph_u32)(v)); \
		vh = SPH_T32((sph_u32)((v) >> 32)); \
		ul = SPH_ROTL32(ul, 2) ^ vl; \
		vl = SPH_ROTL32(vl, 14) ^ ul; \
		ul = SPH_ROTL32(ul, 10) ^ vl; \
		vl = SPH_ROTL32(vl, 1); \
		uh = SPH_ROTL32(uh, 2) ^ vh; \
		vh = SPH_ROTL32(vh, 14) ^ uh; \
		uh = SPH_ROTL32(uh, 10) ^ vh; \
		vh = SPH_ROTL32(vh, 1); \
		(u) = (sph_u64)ul | ((sph_u64)uh << 32); \
		(v) = (sph_u64)vl | ((sph_u64)vh << 32); \
	} while (0)

#else

#define SUB_CRUMB(a0, a1, a2, a3)   do { \
		sph_u32 tmp; \
		tmp = (a0); \
		(a0) |= (a1); \
		(a2) ^= (a3); \
		(a1) = SPH_T32(~(a1)); \
		(a0) ^= (a3); \
		(a3) &= tmp; \
		(a1) ^= (a3); \
		(a3) ^= (a2); \
		(a2) &= (a0); \
		(a0) = SPH_T32(~(a0)); \
		(a2) ^= (a1); \
		(a1) |= (a3); \
		tmp ^= (a1); \
		(a3) ^= (a2); \
		(a2) &= (a1); \
		(a1) ^= (a0); \
		(a0) = tmp; \
	} while (0)

#endif

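/*
 * MIX_WORD is the MixWord linear diffusion step: two 32-bit words
 * are mixed through XORs and left rotations by 2, 14, 10 and 1.
 */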
#define MIX_WORD(u, v)   do { \
		(v) ^= (u); \
		(u) = SPH_ROTL32((u), 2) ^ (v); \
		(v) = SPH_ROTL32((v), 14) ^ (u); \
		(u) = SPH_ROTL32((u), 10) ^ (v); \
		(v) = SPH_ROTL32((v), 1); \
	} while (0)

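/*
 * State handling for the 3-lane (224/256-bit output) variant:
 * declare, load and store the 3 x 8 words of chaining state.
 */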
#define DECL_STATE3 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27;

#define READ_STATE3(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
	} while (0)

#define WRITE_STATE3(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
	} while (0)

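/*
 * MI3 is the message injection for three lanes: the 32-byte block
 * is decoded big-endian, and mixed into each lane together with the
 * XOR sum of all lanes multiplied by 2 (M2). TWEAK3 rotates the
 * upper four words of lanes 1 and 2 by 1 and 2 bits respectively.
 */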
#define MI3   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(a, a, V2); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V0, M, V0); \
		M2(M, M); \
		XOR(V1, a, V1); \
		XOR(V1, M, V1); \
		M2(M, M); \
		XOR(V2, a, V2); \
		XOR(V2, M, V2); \
	} while (0)

#define TWEAK3   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
	} while (0)

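/*
 * P3 is the core permutation for three lanes: after TWEAK3, each
 * lane runs 8 rounds of SubCrumb, MixWord and round-constant
 * addition. In the parallel build, lanes 0 and 1 are packed into
 * 64-bit words and processed together; lane 2 stays scalar.
 */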
#if SPH_LUFFA_PARALLEL

#define P3   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK3; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
	} while (0)

#else

#define P3   do { \
		int r; \
		TWEAK3; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
	} while (0)

#endif

#define DECL_STATE4 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37;

#define READ_STATE4(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
		V30 = (state)->V[3][0]; \
		V31 = (state)->V[3][1]; \
		V32 = (state)->V[3][2]; \
		V33 = (state)->V[3][3]; \
		V34 = (state)->V[3][4]; \
		V35 = (state)->V[3][5]; \
		V36 = (state)->V[3][6]; \
		V37 = (state)->V[3][7]; \
	} while (0)

#define WRITE_STATE4(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
		(state)->V[3][0] = V30; \
		(state)->V[3][1] = V31; \
		(state)->V[3][2] = V32; \
		(state)->V[3][3] = V33; \
		(state)->V[3][4] = V34; \
		(state)->V[3][5] = V35; \
		(state)->V[3][6] = V36; \
		(state)->V[3][7] = V37; \
	} while (0)

#define MI4   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		DECL_TMP8(b) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(b, V2, V3); \
		XOR(a, a, b); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V1, a, V1); \
		XOR(V2, a, V2); \
		XOR(V3, a, V3); \
		M2(b, V0); \
		XOR(b, b, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V1); \
		M2(V1, V1); \
		XOR(V1, V1, V0); \
		XOR(V0, b, M); \
		M2(M, M); \
		XOR(V1, V1, M); \
		M2(M, M); \
		XOR(V2, V2, M); \
		M2(M, M); \
		XOR(V3, V3, M); \
	} while (0)

#define TWEAK4   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
		V34 = SPH_ROTL32(V34, 3); \
		V35 = SPH_ROTL32(V35, 3); \
		V36 = SPH_ROTL32(V36, 3); \
		V37 = SPH_ROTL32(V37, 3); \
	} while (0)

#if SPH_LUFFA_PARALLEL

#define P4   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK4; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \
		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW230[r]; \
			W4 ^= RCW234[r]; \
		} \
		V20 = SPH_T32((sph_u32)W0); \
		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
		V21 = SPH_T32((sph_u32)W1); \
		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
		V22 = SPH_T32((sph_u32)W2); \
		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
		V23 = SPH_T32((sph_u32)W3); \
		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
		V24 = SPH_T32((sph_u32)W4); \
		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
		V25 = SPH_T32((sph_u32)W5); \
		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
		V26 = SPH_T32((sph_u32)W6); \
		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
		V27 = SPH_T32((sph_u32)W7); \
		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
	} while (0)

#else

#define P4   do { \
		int r; \
		TWEAK4; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V30, V31, V32, V33); \
			SUB_CRUMB(V35, V36, V37, V34); \
			MIX_WORD(V30, V34); \
			MIX_WORD(V31, V35); \
			MIX_WORD(V32, V36); \
			MIX_WORD(V33, V37); \
			V30 ^= RC30[r]; \
			V34 ^= RC34[r]; \
		} \
	} while (0)

#endif

#define DECL_STATE5 \
	sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \
	sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \
	sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \
	sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \
	sph_u32 V40, V41, V42, V43, V44, V45, V46, V47;

#define READ_STATE5(state)   do { \
		V00 = (state)->V[0][0]; \
		V01 = (state)->V[0][1]; \
		V02 = (state)->V[0][2]; \
		V03 = (state)->V[0][3]; \
		V04 = (state)->V[0][4]; \
		V05 = (state)->V[0][5]; \
		V06 = (state)->V[0][6]; \
		V07 = (state)->V[0][7]; \
		V10 = (state)->V[1][0]; \
		V11 = (state)->V[1][1]; \
		V12 = (state)->V[1][2]; \
		V13 = (state)->V[1][3]; \
		V14 = (state)->V[1][4]; \
		V15 = (state)->V[1][5]; \
		V16 = (state)->V[1][6]; \
		V17 = (state)->V[1][7]; \
		V20 = (state)->V[2][0]; \
		V21 = (state)->V[2][1]; \
		V22 = (state)->V[2][2]; \
		V23 = (state)->V[2][3]; \
		V24 = (state)->V[2][4]; \
		V25 = (state)->V[2][5]; \
		V26 = (state)->V[2][6]; \
		V27 = (state)->V[2][7]; \
		V30 = (state)->V[3][0]; \
		V31 = (state)->V[3][1]; \
		V32 = (state)->V[3][2]; \
		V33 = (state)->V[3][3]; \
		V34 = (state)->V[3][4]; \
		V35 = (state)->V[3][5]; \
		V36 = (state)->V[3][6]; \
		V37 = (state)->V[3][7]; \
		V40 = (state)->V[4][0]; \
		V41 = (state)->V[4][1]; \
		V42 = (state)->V[4][2]; \
		V43 = (state)->V[4][3]; \
		V44 = (state)->V[4][4]; \
		V45 = (state)->V[4][5]; \
		V46 = (state)->V[4][6]; \
		V47 = (state)->V[4][7]; \
	} while (0)

#define WRITE_STATE5(state)   do { \
		(state)->V[0][0] = V00; \
		(state)->V[0][1] = V01; \
		(state)->V[0][2] = V02; \
		(state)->V[0][3] = V03; \
		(state)->V[0][4] = V04; \
		(state)->V[0][5] = V05; \
		(state)->V[0][6] = V06; \
		(state)->V[0][7] = V07; \
		(state)->V[1][0] = V10; \
		(state)->V[1][1] = V11; \
		(state)->V[1][2] = V12; \
		(state)->V[1][3] = V13; \
		(state)->V[1][4] = V14; \
		(state)->V[1][5] = V15; \
		(state)->V[1][6] = V16; \
		(state)->V[1][7] = V17; \
		(state)->V[2][0] = V20; \
		(state)->V[2][1] = V21; \
		(state)->V[2][2] = V22; \
		(state)->V[2][3] = V23; \
		(state)->V[2][4] = V24; \
		(state)->V[2][5] = V25; \
		(state)->V[2][6] = V26; \
		(state)->V[2][7] = V27; \
		(state)->V[3][0] = V30; \
		(state)->V[3][1] = V31; \
		(state)->V[3][2] = V32; \
		(state)->V[3][3] = V33; \
		(state)->V[3][4] = V34; \
		(state)->V[3][5] = V35; \
		(state)->V[3][6] = V36; \
		(state)->V[3][7] = V37; \
		(state)->V[4][0] = V40; \
		(state)->V[4][1] = V41; \
		(state)->V[4][2] = V42; \
		(state)->V[4][3] = V43; \
		(state)->V[4][4] = V44; \
		(state)->V[4][5] = V45; \
		(state)->V[4][6] = V46; \
		(state)->V[4][7] = V47; \
	} while (0)

#define MI5   do { \
		DECL_TMP8(M) \
		DECL_TMP8(a) \
		DECL_TMP8(b) \
		M0 = sph_dec32be_aligned(buf + 0); \
		M1 = sph_dec32be_aligned(buf + 4); \
		M2 = sph_dec32be_aligned(buf + 8); \
		M3 = sph_dec32be_aligned(buf + 12); \
		M4 = sph_dec32be_aligned(buf + 16); \
		M5 = sph_dec32be_aligned(buf + 20); \
		M6 = sph_dec32be_aligned(buf + 24); \
		M7 = sph_dec32be_aligned(buf + 28); \
		XOR(a, V0, V1); \
		XOR(b, V2, V3); \
		XOR(a, a, b); \
		XOR(a, a, V4); \
		M2(a, a); \
		XOR(V0, a, V0); \
		XOR(V1, a, V1); \
		XOR(V2, a, V2); \
		XOR(V3, a, V3); \
		XOR(V4, a, V4); \
		M2(b, V0); \
		XOR(b, b, V1); \
		M2(V1, V1); \
		XOR(V1, V1, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V4); \
		M2(V4, V4); \
		XOR(V4, V4, V0); \
		M2(V0, b); \
		XOR(V0, V0, V4); \
		M2(V4, V4); \
		XOR(V4, V4, V3); \
		M2(V3, V3); \
		XOR(V3, V3, V2); \
		M2(V2, V2); \
		XOR(V2, V2, V1); \
		M2(V1, V1); \
		XOR(V1, V1, b); \
		XOR(V0, V0, M); \
		M2(M, M); \
		XOR(V1, V1, M); \
		M2(M, M); \
		XOR(V2, V2, M); \
		M2(M, M); \
		XOR(V3, V3, M); \
		M2(M, M); \
		XOR(V4, V4, M); \
	} while (0)

#define TWEAK5   do { \
		V14 = SPH_ROTL32(V14, 1); \
		V15 = SPH_ROTL32(V15, 1); \
		V16 = SPH_ROTL32(V16, 1); \
		V17 = SPH_ROTL32(V17, 1); \
		V24 = SPH_ROTL32(V24, 2); \
		V25 = SPH_ROTL32(V25, 2); \
		V26 = SPH_ROTL32(V26, 2); \
		V27 = SPH_ROTL32(V27, 2); \
		V34 = SPH_ROTL32(V34, 3); \
		V35 = SPH_ROTL32(V35, 3); \
		V36 = SPH_ROTL32(V36, 3); \
		V37 = SPH_ROTL32(V37, 3); \
		V44 = SPH_ROTL32(V44, 4); \
		V45 = SPH_ROTL32(V45, 4); \
		V46 = SPH_ROTL32(V46, 4); \
		V47 = SPH_ROTL32(V47, 4); \
	} while (0)

#if SPH_LUFFA_PARALLEL

#define P5   do { \
		int r; \
		sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \
		TWEAK5; \
		W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \
		W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \
		W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \
		W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \
		W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \
		W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \
		W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \
		W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW010[r]; \
			W4 ^= RCW014[r]; \
		} \
		V00 = SPH_T32((sph_u32)W0); \
		V10 = SPH_T32((sph_u32)(W0 >> 32)); \
		V01 = SPH_T32((sph_u32)W1); \
		V11 = SPH_T32((sph_u32)(W1 >> 32)); \
		V02 = SPH_T32((sph_u32)W2); \
		V12 = SPH_T32((sph_u32)(W2 >> 32)); \
		V03 = SPH_T32((sph_u32)W3); \
		V13 = SPH_T32((sph_u32)(W3 >> 32)); \
		V04 = SPH_T32((sph_u32)W4); \
		V14 = SPH_T32((sph_u32)(W4 >> 32)); \
		V05 = SPH_T32((sph_u32)W5); \
		V15 = SPH_T32((sph_u32)(W5 >> 32)); \
		V06 = SPH_T32((sph_u32)W6); \
		V16 = SPH_T32((sph_u32)(W6 >> 32)); \
		V07 = SPH_T32((sph_u32)W7); \
		V17 = SPH_T32((sph_u32)(W7 >> 32)); \
		W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \
		W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \
		W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \
		W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \
		W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \
		W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \
		W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \
		W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMBW(W0, W1, W2, W3); \
			SUB_CRUMBW(W5, W6, W7, W4); \
			MIX_WORDW(W0, W4); \
			MIX_WORDW(W1, W5); \
			MIX_WORDW(W2, W6); \
			MIX_WORDW(W3, W7); \
			W0 ^= RCW230[r]; \
			W4 ^= RCW234[r]; \
		} \
		V20 = SPH_T32((sph_u32)W0); \
		V30 = SPH_T32((sph_u32)(W0 >> 32)); \
		V21 = SPH_T32((sph_u32)W1); \
		V31 = SPH_T32((sph_u32)(W1 >> 32)); \
		V22 = SPH_T32((sph_u32)W2); \
		V32 = SPH_T32((sph_u32)(W2 >> 32)); \
		V23 = SPH_T32((sph_u32)W3); \
		V33 = SPH_T32((sph_u32)(W3 >> 32)); \
		V24 = SPH_T32((sph_u32)W4); \
		V34 = SPH_T32((sph_u32)(W4 >> 32)); \
		V25 = SPH_T32((sph_u32)W5); \
		V35 = SPH_T32((sph_u32)(W5 >> 32)); \
		V26 = SPH_T32((sph_u32)W6); \
		V36 = SPH_T32((sph_u32)(W6 >> 32)); \
		V27 = SPH_T32((sph_u32)W7); \
		V37 = SPH_T32((sph_u32)(W7 >> 32)); \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V40, V41, V42, V43); \
			SUB_CRUMB(V45, V46, V47, V44); \
			MIX_WORD(V40, V44); \
			MIX_WORD(V41, V45); \
			MIX_WORD(V42, V46); \
			MIX_WORD(V43, V47); \
			V40 ^= RC40[r]; \
			V44 ^= RC44[r]; \
		} \
	} while (0)

#else

#define P5   do { \
		int r; \
		TWEAK5; \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V00, V01, V02, V03); \
			SUB_CRUMB(V05, V06, V07, V04); \
			MIX_WORD(V00, V04); \
			MIX_WORD(V01, V05); \
			MIX_WORD(V02, V06); \
			MIX_WORD(V03, V07); \
			V00 ^= RC00[r]; \
			V04 ^= RC04[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V10, V11, V12, V13); \
			SUB_CRUMB(V15, V16, V17, V14); \
			MIX_WORD(V10, V14); \
			MIX_WORD(V11, V15); \
			MIX_WORD(V12, V16); \
			MIX_WORD(V13, V17); \
			V10 ^= RC10[r]; \
			V14 ^= RC14[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V20, V21, V22, V23); \
			SUB_CRUMB(V25, V26, V27, V24); \
			MIX_WORD(V20, V24); \
			MIX_WORD(V21, V25); \
			MIX_WORD(V22, V26); \
			MIX_WORD(V23, V27); \
			V20 ^= RC20[r]; \
			V24 ^= RC24[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V30, V31, V32, V33); \
			SUB_CRUMB(V35, V36, V37, V34); \
			MIX_WORD(V30, V34); \
			MIX_WORD(V31, V35); \
			MIX_WORD(V32, V36); \
			MIX_WORD(V33, V37); \
			V30 ^= RC30[r]; \
			V34 ^= RC34[r]; \
		} \
		for (r = 0; r < 8; r ++) { \
			SUB_CRUMB(V40, V41, V42, V43); \
			SUB_CRUMB(V45, V46, V47, V44); \
			MIX_WORD(V40, V44); \
			MIX_WORD(V41, V45); \
			MIX_WORD(V42, V46); \
			MIX_WORD(V43, V47); \
			V40 ^= RC40[r]; \
			V44 ^= RC44[r]; \
		} \
	} while (0)

#endif

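/*
 * Streaming core: luffa3/4/5 buffer the input and run MI+P on each
 * full 32-byte block; the *_close variants append the final padding
 * byte, process blank rounds on zero blocks, and extract the digest
 * as the big-endian XOR of all lanes.
 */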
static void
luffa3(sph_luffa224_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE3

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE3(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI3;
			P3;
			ptr = 0;
		}
	}
	WRITE_STATE3(sc);
	sc->ptr = ptr;
}

static void
luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n,
	void *dst, unsigned out_size_w32)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE3

	buf = sc->buf;
	ptr = sc->ptr;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE3(sc);
	for (i = 0; i < 2; i ++) {
		MI3;
		P3;
		memset(buf, 0, sizeof sc->buf);
	}
	out = dst;
	sph_enc32be(out +  0, V00 ^ V10 ^ V20);
	sph_enc32be(out +  4, V01 ^ V11 ^ V21);
	sph_enc32be(out +  8, V02 ^ V12 ^ V22);
	sph_enc32be(out + 12, V03 ^ V13 ^ V23);
	sph_enc32be(out + 16, V04 ^ V14 ^ V24);
	sph_enc32be(out + 20, V05 ^ V15 ^ V25);
	sph_enc32be(out + 24, V06 ^ V16 ^ V26);
	if (out_size_w32 > 7)
		sph_enc32be(out + 28, V07 ^ V17 ^ V27);
}

static void
luffa4(sph_luffa384_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE4

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE4(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI4;
			P4;
			ptr = 0;
		}
	}
	WRITE_STATE4(sc);
	sc->ptr = ptr;
}

static void
luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE4

	buf = sc->buf;
	ptr = sc->ptr;
	out = dst;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE4(sc);
	for (i = 0; i < 3; i ++) {
		MI4;
		P4;
		switch (i) {
		case 0:
			memset(buf, 0, sizeof sc->buf);
			break;
		case 1:
			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30);
			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31);
			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32);
			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33);
			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34);
			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35);
			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36);
			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37);
			break;
		case 2:
			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30);
			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31);
			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32);
			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33);
			break;
		}
	}
}

static void
luffa5(sph_luffa512_context *sc, const void *data, size_t len)
{
	unsigned char *buf;
	size_t ptr;
	DECL_STATE5

	buf = sc->buf;
	ptr = sc->ptr;
	if (len < (sizeof sc->buf) - ptr) {
		memcpy(buf + ptr, data, len);
		ptr += len;
		sc->ptr = ptr;
		return;
	}

	READ_STATE5(sc);
	while (len > 0) {
		size_t clen;

		clen = (sizeof sc->buf) - ptr;
		if (clen > len)
			clen = len;
		memcpy(buf + ptr, data, clen);
		ptr += clen;
		data = (const unsigned char *)data + clen;
		len -= clen;
		if (ptr == sizeof sc->buf) {
			MI5;
			P5;
			ptr = 0;
		}
	}
	WRITE_STATE5(sc);
	sc->ptr = ptr;
}

static void
luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst)
{
	unsigned char *buf, *out;
	size_t ptr;
	unsigned z;
	int i;
	DECL_STATE5

	buf = sc->buf;
	ptr = sc->ptr;
	out = dst;
	z = 0x80 >> n;
	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
	READ_STATE5(sc);
	for (i = 0; i < 3; i ++) {
		MI5;
		P5;
		switch (i) {
		case 0:
			memset(buf, 0, sizeof sc->buf);
			break;
		case 1:
			sph_enc32be(out +  0, V00 ^ V10 ^ V20 ^ V30 ^ V40);
			sph_enc32be(out +  4, V01 ^ V11 ^ V21 ^ V31 ^ V41);
			sph_enc32be(out +  8, V02 ^ V12 ^ V22 ^ V32 ^ V42);
			sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43);
			sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34 ^ V44);
			sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45);
			sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46);
			sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47);
			break;
		case 2:
			sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40);
			sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41);
			sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42);
			sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43);
			sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44);
			sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45);
			sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46);
			sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47);
			break;
		}
	}
}

/* see sph_luffa.h */
void
sph_luffa224_init(void *cc)
{
	sph_luffa224_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa224(void *cc, const void *data, size_t len)
{
	luffa3(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa224_close(void *cc, void *dst)
{
	sph_luffa224_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa3_close(cc, ub, n, dst, 7);
	sph_luffa224_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa256_init(void *cc)
{
	sph_luffa256_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa256(void *cc, const void *data, size_t len)
{
	luffa3(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa256_close(void *cc, void *dst)
{
	sph_luffa256_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa3_close(cc, ub, n, dst, 8);
	sph_luffa256_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa384_init(void *cc)
{
	sph_luffa384_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa384(void *cc, const void *data, size_t len)
{
	luffa4(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa384_close(void *cc, void *dst)
{
	sph_luffa384_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa4_close(cc, ub, n, dst);
	sph_luffa384_init(cc);
}

/* see sph_luffa.h */
void
sph_luffa512_init(void *cc)
{
	sph_luffa512_context *sc;

	sc = cc;
	memcpy(sc->V, V_INIT, sizeof(sc->V));
	sc->ptr = 0;
}

/* see sph_luffa.h */
void
sph_luffa512(void *cc, const void *data, size_t len)
{
	luffa5(cc, data, len);
}

/* see sph_luffa.h */
void
sph_luffa512_close(void *cc, void *dst)
{
	sph_luffa512_addbits_and_close(cc, 0, 0, dst);
}

/* see sph_luffa.h */
void
sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	luffa5_close(cc, ub, n, dst);
	sph_luffa512_init(cc);
}

#ifdef __cplusplus
}
#endif
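
/*
 * Illustrative usage sketch (not part of the original sphlib file):
 * hashing a message with the Luffa-512 streaming API defined above.
 * Kept inside #if 0, following this file's own convention for
 * reference-only code; example_luffa512 is a hypothetical helper.
 */
#if 0

#include <stdio.h>

static void
example_luffa512(const void *msg, size_t len)
{
	sph_luffa512_context cc;
	unsigned char digest[64];
	size_t u;

	sph_luffa512_init(&cc);
	sph_luffa512(&cc, msg, len);       /* absorb the message */
	sph_luffa512_close(&cc, digest);   /* pad, finalize, output */
	for (u = 0; u < sizeof digest; u ++)
		printf("%02x", digest[u]);
	printf("\n");
}

#endif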