Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
tpruvot
GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/sha3/sph_simd.c
1201 views
1
/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
2
/*
3
* SIMD implementation.
4
*
5
* ==========================(LICENSE BEGIN)============================
6
*
7
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
8
*
9
* Permission is hereby granted, free of charge, to any person obtaining
10
* a copy of this software and associated documentation files (the
11
* "Software"), to deal in the Software without restriction, including
12
* without limitation the rights to use, copy, modify, merge, publish,
13
* distribute, sublicense, and/or sell copies of the Software, and to
14
* permit persons to whom the Software is furnished to do so, subject to
15
* the following conditions:
16
*
17
* The above copyright notice and this permission notice shall be
18
* included in all copies or substantial portions of the Software.
19
*
20
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
*
28
* ===========================(LICENSE END)=============================
29
*
30
* @author Thomas Pornin <[email protected]>
31
*/
32
33
#include <stddef.h>
34
#include <string.h>
35
#include <limits.h>
36
37
#include "sph_simd.h"
38
39
#ifdef __cplusplus
40
extern "C"{
41
#endif
42
43
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
44
#define SPH_SMALL_FOOTPRINT_SIMD 1
45
#endif
46
47
#ifdef _MSC_VER
48
#pragma warning (disable: 4146)
49
#endif
50
51
typedef sph_u32 u32;
52
typedef sph_s32 s32;
53
#define C32 SPH_C32
54
#define T32 SPH_T32
55
#define ROL32 SPH_ROTL32
56
57
#define XCAT(x, y) XCAT_(x, y)
58
#define XCAT_(x, y) x ## y
59
60
/*
61
* The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
62
*/
63
static const s32 alpha_tab[] = {
64
1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
65
190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
66
120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
67
184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
68
8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
69
235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
70
189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
71
187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
72
64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
73
81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
74
227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
75
211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
76
255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
77
134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
78
17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
79
146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
80
241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
81
44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
82
136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
83
140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
84
129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
85
95, 40, 98, 163
86
};
87
88
/*
 * Partial modular-reduction helpers; all FFT arithmetic is done
 * modulo 257.
 * Ranges:
 *  REDS1: from -32768..98302 to -383..383
 *  REDS2: from -2^31..2^31-1 to -32768..98302
 *
 * NOTE(review): both macros right-shift values that may be negative,
 * which assumes an arithmetic (sign-extending) shift.  That behavior
 * is implementation-defined in C, though it holds on all mainstream
 * compilers -- confirm if porting to an exotic toolchain.
 */
#define REDS1(x) (((x) & 0xFF) - ((x) >> 8))
#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16))
96
/*
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 *
 * One radix-2 butterfly pass over the 2*(hk) values q[(rb)] to
 * q[(rb) + 2*(hk) - 1]: each pair (q[rb+u], q[rb+u+hk]) is replaced by
 * (m + t, m - t) where t is the second element multiplied by the
 * twiddle factor alpha_tab[u * (as)] and reduced with REDS2.
 *
 * Element u = 0 has twiddle alpha^0 = 1 and needs no multiplication;
 * it is handled by the three statements before the loop.  The
 * "goto id" then deliberately jumps into the middle of the 4-way
 * unrolled loop body (label "id:", at element 1 of the first
 * iteration) so the multiplication for element 0 is skipped exactly
 * once.  The "id" label must therefore be unique per macro expansion.
 */
#define FFT_LOOP(rb, hk, as, id) do { \
		size_t u, v; \
		s32 m = q[(rb)]; \
		s32 n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		u = v = 0; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			s32 t; \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)
135
136
/*
 * Compute eight FFT output coefficients d##0 .. d##7 from the four
 * input bytes x[(xb) + k * (xs)], k = 0..3.  The twiddle factor here
 * is alpha = 4, so all multiplications reduce to left shifts; REDS1
 * keeps intermediate values within the documented ranges.
 *
 * Output ranges:
 *   d0: min= 0 max= 1020
 *   d1: min= -67 max= 4587
 *   d2: min=-4335 max= 4335
 *   d3: min=-4147 max= 507
 *   d4: min= -510 max= 510
 *   d5: min= -252 max= 4402
 *   d6: min=-4335 max= 4335
 *   d7: min=-4332 max= 322
 */
#define FFT8(xb, xs, d) do { \
		s32 x0 = x[(xb)]; \
		s32 x1 = x[(xb) + (xs)]; \
		s32 x2 = x[(xb) + 2 * (xs)]; \
		s32 x3 = x[(xb) + 3 * (xs)]; \
		s32 a0 = x0 + x2; \
		s32 a1 = x0 + (x2 << 4); \
		s32 a2 = x0 - x2; \
		s32 a3 = x0 - (x2 << 4); \
		s32 b0 = x1 + x3; \
		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
		s32 b2 = (x1 << 4) - (x3 << 4); \
		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
		d ## 0 = a0 + b0; \
		d ## 1 = a1 + b1; \
		d ## 2 = a2 + b2; \
		d ## 3 = a3 + b3; \
		d ## 4 = a0 - b0; \
		d ## 5 = a1 - b1; \
		d ## 6 = a2 - b2; \
		d ## 7 = a3 - b3; \
	} while (0)
169
170
/*
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to some shifting.
 *
 * Combines two interleaved FFT8 passes (even / odd input positions,
 * stride doubled) into the sixteen outputs q[(rb)] .. q[(rb) + 15];
 * the butterfly twiddles 2^0 .. 2^7 are applied as shifts on the
 * odd-half results d2_*.
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb) do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) + 0] = d1_0 + d2_0; \
		q[(rb) + 1] = d1_1 + (d2_1 << 1); \
		q[(rb) + 2] = d1_2 + (d2_2 << 2); \
		q[(rb) + 3] = d1_3 + (d2_3 << 3); \
		q[(rb) + 4] = d1_4 + (d2_4 << 4); \
		q[(rb) + 5] = d1_5 + (d2_5 << 5); \
		q[(rb) + 6] = d1_6 + (d2_6 << 6); \
		q[(rb) + 7] = d1_7 + (d2_7 << 7); \
		q[(rb) + 8] = d1_0 - d2_0; \
		q[(rb) + 9] = d1_1 - (d2_1 << 1); \
		q[(rb) + 10] = d1_2 - (d2_2 << 2); \
		q[(rb) + 11] = d1_3 - (d2_3 << 3); \
		q[(rb) + 12] = d1_4 - (d2_4 << 4); \
		q[(rb) + 13] = d1_5 - (d2_5 << 5); \
		q[(rb) + 14] = d1_6 - (d2_6 << 6); \
		q[(rb) + 15] = d1_7 - (d2_7 << 7); \
	} while (0)
198
199
/*
 * Output range: |q| <= 1183446
 *
 * 32-point FFT: two interleaved FFT16 halves merged by one FFT_LOOP
 * butterfly pass (twiddle stride 8 into alpha_tab).  The "id" label
 * argument is forwarded to FFT_LOOP and must be unique per expansion.
 */
#define FFT32(xb, xs, rb, id) do { \
		FFT16(xb, (xs) << 1, rb); \
		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
		FFT_LOOP(rb, 16, 8, id); \
	} while (0)

/*
 * Output range: |q| <= 2366892
 *
 * 64-point FFT built from two FFT32 halves; XCAT derives distinct
 * sub-labels (id##a, id##b) for the nested FFT_LOOP expansions.
 */
#define FFT64(xb, xs, rb, id) do { \
		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
		FFT_LOOP(rb, 32, 4, id); \
	} while (0)
216
217
#if SPH_SMALL_FOOTPRINT_SIMD
218
219
static void
220
fft32(unsigned char *x, size_t xs, s32 *q)
221
{
222
size_t xd;
223
224
xd = xs << 1;
225
FFT16(0, xd, 0);
226
FFT16(xs, xd, 16);
227
FFT_LOOP(0, 16, 8, label_);
228
}
229
230
#define FFT128(xb, xs, rb, id) do { \
231
fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
232
fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
233
FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
234
fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
235
fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
236
FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
237
FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
238
} while (0)
239
240
#else
241
242
/*
243
* Output range: |q| <= 4733784
244
*/
245
#define FFT128(xb, xs, rb, id) do { \
246
FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
247
FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
248
FFT_LOOP(rb, 64, 2, id); \
249
} while (0)
250
251
#endif
252
253
/*
254
* For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
255
* function which does not fit in the 32 kB L1 cache of a typical x86
256
* Intel. We therefore add a function call layer at the FFT64 level.
257
*/
258
259
static void
260
fft64(unsigned char *x, size_t xs, s32 *q)
261
{
262
size_t xd;
263
264
xd = xs << 1;
265
FFT32(0, xd, 0, label_a);
266
FFT32(xs, xd, 32, label_b);
267
FFT_LOOP(0, 32, 4, label_);
268
}
269
270
/*
271
* Output range: |q| <= 9467568
272
*/
273
#define FFT256(xb, xs, rb, id) do { \
274
fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
275
fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \
276
FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
277
fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
278
fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
279
FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
280
FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
281
} while (0)
282
283
/*
284
* alpha^(127*i) mod 257
285
*/
286
static const unsigned short yoff_s_n[] = {
287
1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
288
15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
289
225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
290
34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
291
253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
292
197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
293
128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
294
121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
295
16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
296
240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
297
2, 196, 190, 116, 60, 226, 46, 139
298
};
299
300
/*
301
* alpha^(127*i) + alpha^(125*i) mod 257
302
*/
303
static const unsigned short yoff_s_f[] = {
304
2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
305
49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
306
96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
307
17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
308
189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
309
77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
310
160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
311
181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
312
0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
313
210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
314
10, 146, 117, 251, 180, 247, 236, 108
315
};
316
317
/*
318
* beta^(255*i) mod 257
319
*/
320
static const unsigned short yoff_b_n[] = {
321
1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
322
23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
323
15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
324
88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
325
225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
326
35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
327
34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
328
11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
329
253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
330
165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
331
197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
332
162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
333
128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
334
117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
335
121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
336
213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
337
16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
338
111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
339
240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
340
123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
341
2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
342
46, 45, 139, 41
343
};
344
345
/*
346
* beta^(255*i) + beta^(253*i) mod 257
347
*/
348
static const unsigned short yoff_b_f[] = {
349
2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
350
111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
351
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
352
253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
353
96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
354
248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
355
17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
356
57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
357
189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
358
187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
359
77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
360
139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
361
160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
362
106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
363
181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
364
96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
365
0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
366
245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
367
210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
368
53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
369
10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
370
236, 192, 108, 86
371
};
372
373
/*
 * INNER packs two 16-bit products into one 32-bit word: the low half
 * is (l)*(mm) and the high half is (h)*(mm).
 *
 * W_SMALL deliberately opens a parenthesis that it never closes and
 * expands to four comma-separated INNER() values.  The expansion is
 * completed by the STEP_SMALL_ wrapper macro, which appends the
 * missing ")" so that the four values become the four message-word
 * arguments of STEP_SMALL.
 */
#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \
			+ ((u32)((h) * (mm)) << 16))

#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
382
#define WS_0_0 W_SMALL( 4, 0, 1, 185)
383
#define WS_0_1 W_SMALL( 6, 0, 1, 185)
384
#define WS_0_2 W_SMALL( 0, 0, 1, 185)
385
#define WS_0_3 W_SMALL( 2, 0, 1, 185)
386
#define WS_0_4 W_SMALL( 7, 0, 1, 185)
387
#define WS_0_5 W_SMALL( 5, 0, 1, 185)
388
#define WS_0_6 W_SMALL( 3, 0, 1, 185)
389
#define WS_0_7 W_SMALL( 1, 0, 1, 185)
390
#define WS_1_0 W_SMALL(15, 0, 1, 185)
391
#define WS_1_1 W_SMALL(11, 0, 1, 185)
392
#define WS_1_2 W_SMALL(12, 0, 1, 185)
393
#define WS_1_3 W_SMALL( 8, 0, 1, 185)
394
#define WS_1_4 W_SMALL( 9, 0, 1, 185)
395
#define WS_1_5 W_SMALL(13, 0, 1, 185)
396
#define WS_1_6 W_SMALL(10, 0, 1, 185)
397
#define WS_1_7 W_SMALL(14, 0, 1, 185)
398
#define WS_2_0 W_SMALL(17, -128, -64, 233)
399
#define WS_2_1 W_SMALL(18, -128, -64, 233)
400
#define WS_2_2 W_SMALL(23, -128, -64, 233)
401
#define WS_2_3 W_SMALL(20, -128, -64, 233)
402
#define WS_2_4 W_SMALL(22, -128, -64, 233)
403
#define WS_2_5 W_SMALL(21, -128, -64, 233)
404
#define WS_2_6 W_SMALL(16, -128, -64, 233)
405
#define WS_2_7 W_SMALL(19, -128, -64, 233)
406
#define WS_3_0 W_SMALL(30, -191, -127, 233)
407
#define WS_3_1 W_SMALL(24, -191, -127, 233)
408
#define WS_3_2 W_SMALL(25, -191, -127, 233)
409
#define WS_3_3 W_SMALL(31, -191, -127, 233)
410
#define WS_3_4 W_SMALL(27, -191, -127, 233)
411
#define WS_3_5 W_SMALL(29, -191, -127, 233)
412
#define WS_3_6 W_SMALL(28, -191, -127, 233)
413
#define WS_3_7 W_SMALL(26, -191, -127, 233)
414
415
#define W_BIG(sb, o1, o2, mm) \
416
(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
417
INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
418
INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
419
INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
420
INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
421
INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
422
INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
423
INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
424
425
#define WB_0_0 W_BIG( 4, 0, 1, 185)
426
#define WB_0_1 W_BIG( 6, 0, 1, 185)
427
#define WB_0_2 W_BIG( 0, 0, 1, 185)
428
#define WB_0_3 W_BIG( 2, 0, 1, 185)
429
#define WB_0_4 W_BIG( 7, 0, 1, 185)
430
#define WB_0_5 W_BIG( 5, 0, 1, 185)
431
#define WB_0_6 W_BIG( 3, 0, 1, 185)
432
#define WB_0_7 W_BIG( 1, 0, 1, 185)
433
#define WB_1_0 W_BIG(15, 0, 1, 185)
434
#define WB_1_1 W_BIG(11, 0, 1, 185)
435
#define WB_1_2 W_BIG(12, 0, 1, 185)
436
#define WB_1_3 W_BIG( 8, 0, 1, 185)
437
#define WB_1_4 W_BIG( 9, 0, 1, 185)
438
#define WB_1_5 W_BIG(13, 0, 1, 185)
439
#define WB_1_6 W_BIG(10, 0, 1, 185)
440
#define WB_1_7 W_BIG(14, 0, 1, 185)
441
#define WB_2_0 W_BIG(17, -256, -128, 233)
442
#define WB_2_1 W_BIG(18, -256, -128, 233)
443
#define WB_2_2 W_BIG(23, -256, -128, 233)
444
#define WB_2_3 W_BIG(20, -256, -128, 233)
445
#define WB_2_4 W_BIG(22, -256, -128, 233)
446
#define WB_2_5 W_BIG(21, -256, -128, 233)
447
#define WB_2_6 W_BIG(16, -256, -128, 233)
448
#define WB_2_7 W_BIG(19, -256, -128, 233)
449
#define WB_3_0 W_BIG(30, -383, -255, 233)
450
#define WB_3_1 W_BIG(24, -383, -255, 233)
451
#define WB_3_2 W_BIG(25, -383, -255, 233)
452
#define WB_3_3 W_BIG(31, -383, -255, 233)
453
#define WB_3_4 W_BIG(27, -383, -255, 233)
454
#define WB_3_5 W_BIG(29, -383, -255, 233)
455
#define WB_3_6 W_BIG(28, -383, -255, 233)
456
#define WB_3_7 W_BIG(26, -383, -255, 233)
457
458
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
459
#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
460
461
#define PP4_0_0 1
462
#define PP4_0_1 0
463
#define PP4_0_2 3
464
#define PP4_0_3 2
465
#define PP4_1_0 2
466
#define PP4_1_1 3
467
#define PP4_1_2 0
468
#define PP4_1_3 1
469
#define PP4_2_0 3
470
#define PP4_2_1 2
471
#define PP4_2_2 1
472
#define PP4_2_3 0
473
474
#define PP8_0_0 1
475
#define PP8_0_1 0
476
#define PP8_0_2 3
477
#define PP8_0_3 2
478
#define PP8_0_4 5
479
#define PP8_0_5 4
480
#define PP8_0_6 7
481
#define PP8_0_7 6
482
483
#define PP8_1_0 6
484
#define PP8_1_1 7
485
#define PP8_1_2 4
486
#define PP8_1_3 5
487
#define PP8_1_4 2
488
#define PP8_1_5 3
489
#define PP8_1_6 0
490
#define PP8_1_7 1
491
492
#define PP8_2_0 2
493
#define PP8_2_1 3
494
#define PP8_2_2 0
495
#define PP8_2_3 1
496
#define PP8_2_4 6
497
#define PP8_2_5 7
498
#define PP8_2_6 4
499
#define PP8_2_7 5
500
501
#define PP8_3_0 3
502
#define PP8_3_1 2
503
#define PP8_3_2 1
504
#define PP8_3_3 0
505
#define PP8_3_4 7
506
#define PP8_3_5 6
507
#define PP8_3_6 5
508
#define PP8_3_7 4
509
510
#define PP8_4_0 5
511
#define PP8_4_1 4
512
#define PP8_4_2 7
513
#define PP8_4_3 6
514
#define PP8_4_4 1
515
#define PP8_4_5 0
516
#define PP8_4_6 3
517
#define PP8_4_7 2
518
519
#define PP8_5_0 7
520
#define PP8_5_1 6
521
#define PP8_5_2 5
522
#define PP8_5_3 4
523
#define PP8_5_4 3
524
#define PP8_5_5 2
525
#define PP8_5_6 1
526
#define PP8_5_7 0
527
528
#define PP8_6_0 4
529
#define PP8_6_1 5
530
#define PP8_6_2 6
531
#define PP8_6_3 7
532
#define PP8_6_4 0
533
#define PP8_6_5 1
534
#define PP8_6_6 2
535
#define PP8_6_7 3
536
537
#if SPH_SIMD_NOCOPY
538
539
#define DECL_STATE_SMALL
540
#define READ_STATE_SMALL(sc)
541
#define WRITE_STATE_SMALL(sc)
542
#define DECL_STATE_BIG
543
#define READ_STATE_BIG(sc)
544
#define WRITE_STATE_BIG(sc)
545
546
#else
547
548
#define DECL_STATE_SMALL \
549
u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
550
551
#define READ_STATE_SMALL(sc) do { \
552
A0 = (sc)->state[ 0]; \
553
A1 = (sc)->state[ 1]; \
554
A2 = (sc)->state[ 2]; \
555
A3 = (sc)->state[ 3]; \
556
B0 = (sc)->state[ 4]; \
557
B1 = (sc)->state[ 5]; \
558
B2 = (sc)->state[ 6]; \
559
B3 = (sc)->state[ 7]; \
560
C0 = (sc)->state[ 8]; \
561
C1 = (sc)->state[ 9]; \
562
C2 = (sc)->state[10]; \
563
C3 = (sc)->state[11]; \
564
D0 = (sc)->state[12]; \
565
D1 = (sc)->state[13]; \
566
D2 = (sc)->state[14]; \
567
D3 = (sc)->state[15]; \
568
} while (0)
569
570
#define WRITE_STATE_SMALL(sc) do { \
571
(sc)->state[ 0] = A0; \
572
(sc)->state[ 1] = A1; \
573
(sc)->state[ 2] = A2; \
574
(sc)->state[ 3] = A3; \
575
(sc)->state[ 4] = B0; \
576
(sc)->state[ 5] = B1; \
577
(sc)->state[ 6] = B2; \
578
(sc)->state[ 7] = B3; \
579
(sc)->state[ 8] = C0; \
580
(sc)->state[ 9] = C1; \
581
(sc)->state[10] = C2; \
582
(sc)->state[11] = C3; \
583
(sc)->state[12] = D0; \
584
(sc)->state[13] = D1; \
585
(sc)->state[14] = D2; \
586
(sc)->state[15] = D3; \
587
} while (0)
588
589
#define DECL_STATE_BIG \
590
u32 A0, A1, A2, A3, A4, A5, A6, A7; \
591
u32 B0, B1, B2, B3, B4, B5, B6, B7; \
592
u32 C0, C1, C2, C3, C4, C5, C6, C7; \
593
u32 D0, D1, D2, D3, D4, D5, D6, D7;
594
595
#define READ_STATE_BIG(sc) do { \
596
A0 = (sc)->state[ 0]; \
597
A1 = (sc)->state[ 1]; \
598
A2 = (sc)->state[ 2]; \
599
A3 = (sc)->state[ 3]; \
600
A4 = (sc)->state[ 4]; \
601
A5 = (sc)->state[ 5]; \
602
A6 = (sc)->state[ 6]; \
603
A7 = (sc)->state[ 7]; \
604
B0 = (sc)->state[ 8]; \
605
B1 = (sc)->state[ 9]; \
606
B2 = (sc)->state[10]; \
607
B3 = (sc)->state[11]; \
608
B4 = (sc)->state[12]; \
609
B5 = (sc)->state[13]; \
610
B6 = (sc)->state[14]; \
611
B7 = (sc)->state[15]; \
612
C0 = (sc)->state[16]; \
613
C1 = (sc)->state[17]; \
614
C2 = (sc)->state[18]; \
615
C3 = (sc)->state[19]; \
616
C4 = (sc)->state[20]; \
617
C5 = (sc)->state[21]; \
618
C6 = (sc)->state[22]; \
619
C7 = (sc)->state[23]; \
620
D0 = (sc)->state[24]; \
621
D1 = (sc)->state[25]; \
622
D2 = (sc)->state[26]; \
623
D3 = (sc)->state[27]; \
624
D4 = (sc)->state[28]; \
625
D5 = (sc)->state[29]; \
626
D6 = (sc)->state[30]; \
627
D7 = (sc)->state[31]; \
628
} while (0)
629
630
#define WRITE_STATE_BIG(sc) do { \
631
(sc)->state[ 0] = A0; \
632
(sc)->state[ 1] = A1; \
633
(sc)->state[ 2] = A2; \
634
(sc)->state[ 3] = A3; \
635
(sc)->state[ 4] = A4; \
636
(sc)->state[ 5] = A5; \
637
(sc)->state[ 6] = A6; \
638
(sc)->state[ 7] = A7; \
639
(sc)->state[ 8] = B0; \
640
(sc)->state[ 9] = B1; \
641
(sc)->state[10] = B2; \
642
(sc)->state[11] = B3; \
643
(sc)->state[12] = B4; \
644
(sc)->state[13] = B5; \
645
(sc)->state[14] = B6; \
646
(sc)->state[15] = B7; \
647
(sc)->state[16] = C0; \
648
(sc)->state[17] = C1; \
649
(sc)->state[18] = C2; \
650
(sc)->state[19] = C3; \
651
(sc)->state[20] = C4; \
652
(sc)->state[21] = C5; \
653
(sc)->state[22] = C6; \
654
(sc)->state[23] = C7; \
655
(sc)->state[24] = D0; \
656
(sc)->state[25] = D1; \
657
(sc)->state[26] = D2; \
658
(sc)->state[27] = D3; \
659
(sc)->state[28] = D4; \
660
(sc)->state[29] = D5; \
661
(sc)->state[30] = D6; \
662
(sc)->state[31] = D7; \
663
} while (0)
664
665
#endif
666
667
/*
 * One step on lane n: add the message word (w) and the boolean
 * function fun(A, B, C) into D, rotate by s, then mix in the rotated
 * A value (tA) of a *different* lane.  The source lane is selected at
 * compile time: XCAT(tA, XCAT(ppb, n)) pastes ppb (a PP4_x_/PP8_x_
 * prefix) with n to form an identifier such as tA2.  Finally the
 * state registers rotate: D <- C <- B <- tA.
 */
#define STEP_ELT(n, w, fun, s, ppb) do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)

/*
 * One step over the four lanes of the small (SIMD-224/256) state:
 * snapshot the rotated A values first so that all four STEP_ELT
 * invocations see the pre-step values regardless of permutation.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)

/*
 * Same as STEP_SMALL but over the eight lanes of the big
 * (SIMD-384/512) state.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)
704
705
#define M3_0_0 0_
706
#define M3_1_0 1_
707
#define M3_2_0 2_
708
#define M3_3_0 0_
709
#define M3_4_0 1_
710
#define M3_5_0 2_
711
#define M3_6_0 0_
712
#define M3_7_0 1_
713
714
#define M3_0_1 1_
715
#define M3_1_1 2_
716
#define M3_2_1 0_
717
#define M3_3_1 1_
718
#define M3_4_1 2_
719
#define M3_5_1 0_
720
#define M3_6_1 1_
721
#define M3_7_1 2_
722
723
#define M3_0_2 2_
724
#define M3_1_2 0_
725
#define M3_2_2 1_
726
#define M3_3_2 2_
727
#define M3_4_2 0_
728
#define M3_5_2 1_
729
#define M3_6_2 2_
730
#define M3_7_2 0_
731
732
/*
 * STEP_SMALL_ completes the parenthesis deliberately left open by the
 * W_SMALL-based "w" argument: "STEP_SMALL w" pastes the four message
 * words (already preceded by "("), and ", fun, r, s, pp4b)" closes
 * the macro call -- hence the apparently unbalanced ")" below.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b)

/*
 * One round of the small compression function: eight steps, the first
 * four using IF and the last four using MAJ.  Rotation amounts cycle
 * through p0..p3 and the lane permutation for each step is selected
 * from the period-3 M3_ schedule, offset by isp.
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)
752
753
#define M7_0_0 0_
754
#define M7_1_0 1_
755
#define M7_2_0 2_
756
#define M7_3_0 3_
757
#define M7_4_0 4_
758
#define M7_5_0 5_
759
#define M7_6_0 6_
760
#define M7_7_0 0_
761
762
#define M7_0_1 1_
763
#define M7_1_1 2_
764
#define M7_2_1 3_
765
#define M7_3_1 4_
766
#define M7_4_1 5_
767
#define M7_5_1 6_
768
#define M7_6_1 0_
769
#define M7_7_1 1_
770
771
#define M7_0_2 2_
772
#define M7_1_2 3_
773
#define M7_2_2 4_
774
#define M7_3_2 5_
775
#define M7_4_2 6_
776
#define M7_5_2 0_
777
#define M7_6_2 1_
778
#define M7_7_2 2_
779
780
#define M7_0_3 3_
781
#define M7_1_3 4_
782
#define M7_2_3 5_
783
#define M7_3_3 6_
784
#define M7_4_3 0_
785
#define M7_5_3 1_
786
#define M7_6_3 2_
787
#define M7_7_3 3_
788
789
/*
 * Analogous to STEP_SMALL_: the apparently unbalanced ")" closes the
 * parenthesis opened inside the W_BIG-based "w" argument, turning its
 * eight comma-separated values into arguments of STEP_BIG.
 */
#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)

/*
 * One round of the big compression function: eight steps (four IF,
 * four MAJ), rotation amounts cycling p0..p3, lane permutations taken
 * from the period-7 M7_ schedule, offset by isp.
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)
809
810
#if SPH_SMALL_FOOTPRINT_SIMD
811
812
#define A0 state[ 0]
813
#define A1 state[ 1]
814
#define A2 state[ 2]
815
#define A3 state[ 3]
816
#define B0 state[ 4]
817
#define B1 state[ 5]
818
#define B2 state[ 6]
819
#define B3 state[ 7]
820
#define C0 state[ 8]
821
#define C1 state[ 9]
822
#define C2 state[10]
823
#define C3 state[11]
824
#define D0 state[12]
825
#define D1 state[13]
826
#define D2 state[14]
827
#define D3 state[15]
828
829
/*
 * Run-time variants of STEP_ELT / STEP_SMALL for the small-footprint
 * build: the lane permutation is applied by XOR-ing a run-time mask
 * (ppb) into the index of a real tA[] array, instead of token-pasting
 * a compile-time PP4_ constant into an identifier.
 */
#define STEP2_ELT(n, w, fun, s, ppb) do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)

/*
 * One four-lane step: snapshot the rotated A values into tA[] first
 * so that every STEP2_ELT sees pre-step values regardless of the
 * permutation mask.
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)
848
849
/*
 * Perform one round (eight STEP2_SMALL groups) of the small
 * compression function.  The first four groups use IF, the last four
 * MAJ; rotation amounts cycle through p0..p3.
 *
 * state: 16-word working state.  Note that the A0..D3 macros in
 *        effect here expand to state[0]..state[15], so STEP2_SMALL
 *        operates directly on this parameter.
 * w:     32 expanded message words for this round.
 * isp:   offset into the pp4k permutation schedule (0..3 at call
 *        sites).
 */
static void
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	/*
	 * Lane-permutation masks: STEP2_ELT mixes lane n with
	 * tA[pp4k[isp + i] ^ n].  Masks 1, 2, 3 realize the PP4_0_,
	 * PP4_1_, PP4_2_ permutations; the table repeats with period 3.
	 */
	static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };

	STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
	STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
	STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
	STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
	STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
	STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
	STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
	STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
}
863
864
/*
 * Compress one 64-byte message block (sc->buf) into the small
 * (SIMD-224/256) state -- small-footprint variant.  "last" selects
 * the final-block tweak: the yoff_s_f[] offsets are added to the FFT
 * output instead of yoff_s_n[].
 *
 * Note: the A0..D3 macros in effect here expand to state[0]..state[15]
 * (the local array below), so STEP_SMALL operates on the local copy;
 * the result is committed to sc->state by the final memcpy.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	unsigned char *x;
	s32 q[128];
	int i;
	u32 w[32];
	u32 state[16];
	size_t u;

	/*
	 * Message-expansion read order: wsp[i] = 8 * (permuted block
	 * index).  The four groups of eight entries match the WS_0_*,
	 * WS_1_*, WS_2_*, WS_3_* block orderings of the unrolled code.
	 */
	static const size_t wsp[32] = {
		4 << 3, 6 << 3, 0 << 3, 2 << 3,
		7 << 3, 5 << 3, 3 << 3, 1 << 3,
		15 << 3, 11 << 3, 12 << 3, 8 << 3,
		9 << 3, 13 << 3, 10 << 3, 14 << 3,
		17 << 3, 18 << 3, 23 << 3, 20 << 3,
		22 << 3, 21 << 3, 16 << 3, 19 << 3,
		30 << 3, 24 << 3, 25 << 3, 31 << 3,
		27 << 3, 29 << 3, 28 << 3, 26 << 3
	};

	x = sc->buf;
	/* 128-point FFT of the message block into q[0..127]. */
	FFT128(0, 1, 0, ll);
	if (last) {
		/* Final block: add the alpha^(127i)+alpha^(125i) offsets,
		   then normalize each coefficient into -128..128. */
		for (i = 0; i < 128; i ++) {
			s32 tq;

			tq = q[i] + yoff_s_f[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	} else {
		/* Non-final block: same normalization with the
		   alpha^(127i) offsets. */
		for (i = 0; i < 128; i ++) {
			s32 tq;

			tq = q[i] + yoff_s_n[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	}

	/* Working state = chaining value XOR raw message block. */
	for (i = 0; i < 16; i += 4) {
		state[i + 0] = sc->state[i + 0]
			^ sph_dec32le_aligned(x + 4 * (i + 0));
		state[i + 1] = sc->state[i + 1]
			^ sph_dec32le_aligned(x + 4 * (i + 1));
		state[i + 2] = sc->state[i + 2]
			^ sph_dec32le_aligned(x + 4 * (i + 2));
		state[i + 3] = sc->state[i + 3]
			^ sph_dec32le_aligned(x + 4 * (i + 3));
	}

/*
 * Expand 32 message words into w[]: for each step, read eight FFT
 * coefficients (four INNER pairs) from the block selected by wsp[],
 * at offsets o1/o2, multiplied by mm (run-time equivalent of the
 * W_SMALL macro).
 */
#define WSREAD(sb, o1, o2, mm) do { \
		for (u = 0; u < 32; u += 4) { \
			size_t v = wsp[(u >> 2) + (sb)]; \
			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
				q[v + 2 * 0 + (o2)], mm); \
			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
				q[v + 2 * 1 + (o2)], mm); \
			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
				q[v + 2 * 2 + (o2)], mm); \
			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
				q[v + 2 * 3 + (o2)], mm); \
		} \
	} while (0)

	/* Four rounds; (o1, o2, mm) and rotation counts per round. */
	WSREAD( 0, 0, 1, 185);
	one_round_small(state, w, 0, 3, 23, 17, 27);
	WSREAD( 8, 0, 1, 185);
	one_round_small(state, w, 2, 28, 19, 22, 7);
	WSREAD(16, -128, -64, 233);
	one_round_small(state, w, 1, 29, 9, 15, 5);
	WSREAD(24, -191, -127, 233);
	one_round_small(state, w, 0, 4, 13, 10, 25);

#undef WSREAD

	/* Feed-forward: four extra steps that reinject the previous
	   chaining value (sc->state) as the message words. */
	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		IF, 4, 13, PP4_2_);
	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 25, 4, PP4_2_);

	/* Commit the new chaining value. */
	memcpy(sc->state, state, sizeof state);
}
956
957
#undef A0
958
#undef A1
959
#undef A2
960
#undef A3
961
#undef B0
962
#undef B1
963
#undef B2
964
#undef B3
965
#undef C0
966
#undef C1
967
#undef C2
968
#undef C3
969
#undef D0
970
#undef D1
971
#undef D2
972
#undef D3
973
974
#else
975
976
#if SPH_SIMD_NOCOPY
977
#define A0 (sc->state[ 0])
978
#define A1 (sc->state[ 1])
979
#define A2 (sc->state[ 2])
980
#define A3 (sc->state[ 3])
981
#define B0 (sc->state[ 4])
982
#define B1 (sc->state[ 5])
983
#define B2 (sc->state[ 6])
984
#define B3 (sc->state[ 7])
985
#define C0 (sc->state[ 8])
986
#define C1 (sc->state[ 9])
987
#define C2 (sc->state[10])
988
#define C3 (sc->state[11])
989
#define D0 (sc->state[12])
990
#define D1 (sc->state[13])
991
#define D2 (sc->state[14])
992
#define D3 (sc->state[15])
993
#endif
994
995
static void
996
compress_small(sph_simd_small_context *sc, int last)
997
{
998
unsigned char *x;
999
s32 q[128];
1000
int i;
1001
DECL_STATE_SMALL
1002
#if SPH_SIMD_NOCOPY
1003
sph_u32 saved[16];
1004
#endif
1005
1006
#if SPH_SIMD_NOCOPY
1007
memcpy(saved, sc->state, sizeof saved);
1008
#endif
1009
x = sc->buf;
1010
FFT128(0, 1, 0, ll);
1011
if (last) {
1012
for (i = 0; i < 128; i ++) {
1013
s32 tq;
1014
1015
tq = q[i] + yoff_s_f[i];
1016
tq = REDS2(tq);
1017
tq = REDS1(tq);
1018
tq = REDS1(tq);
1019
q[i] = (tq <= 128 ? tq : tq - 257);
1020
}
1021
} else {
1022
for (i = 0; i < 128; i ++) {
1023
s32 tq;
1024
1025
tq = q[i] + yoff_s_n[i];
1026
tq = REDS2(tq);
1027
tq = REDS1(tq);
1028
tq = REDS1(tq);
1029
q[i] = (tq <= 128 ? tq : tq - 257);
1030
}
1031
}
1032
READ_STATE_SMALL(sc);
1033
A0 ^= sph_dec32le_aligned(x + 0);
1034
A1 ^= sph_dec32le_aligned(x + 4);
1035
A2 ^= sph_dec32le_aligned(x + 8);
1036
A3 ^= sph_dec32le_aligned(x + 12);
1037
B0 ^= sph_dec32le_aligned(x + 16);
1038
B1 ^= sph_dec32le_aligned(x + 20);
1039
B2 ^= sph_dec32le_aligned(x + 24);
1040
B3 ^= sph_dec32le_aligned(x + 28);
1041
C0 ^= sph_dec32le_aligned(x + 32);
1042
C1 ^= sph_dec32le_aligned(x + 36);
1043
C2 ^= sph_dec32le_aligned(x + 40);
1044
C3 ^= sph_dec32le_aligned(x + 44);
1045
D0 ^= sph_dec32le_aligned(x + 48);
1046
D1 ^= sph_dec32le_aligned(x + 52);
1047
D2 ^= sph_dec32le_aligned(x + 56);
1048
D3 ^= sph_dec32le_aligned(x + 60);
1049
ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
1050
ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
1051
ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
1052
ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
1053
#if SPH_SIMD_NOCOPY
1054
STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1055
IF, 4, 13, PP4_2_);
1056
STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1057
IF, 13, 10, PP4_0_);
1058
STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
1059
IF, 10, 25, PP4_1_);
1060
STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
1061
IF, 25, 4, PP4_2_);
1062
#else
1063
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1064
IF, 4, 13, PP4_2_);
1065
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1066
IF, 13, 10, PP4_0_);
1067
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1068
IF, 10, 25, PP4_1_);
1069
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1070
IF, 25, 4, PP4_2_);
1071
WRITE_STATE_SMALL(sc);
1072
#endif
1073
}
1074
1075
#if SPH_SIMD_NOCOPY
1076
#undef A0
1077
#undef A1
1078
#undef A2
1079
#undef A3
1080
#undef B0
1081
#undef B1
1082
#undef B2
1083
#undef B3
1084
#undef C0
1085
#undef C1
1086
#undef C2
1087
#undef C3
1088
#undef D0
1089
#undef D1
1090
#undef D2
1091
#undef D3
1092
#endif
1093
1094
#endif
1095
1096
#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Alias the 32 words of the expanded state array onto the A/B/C/D
 * lane names expected by the STEP2_* round macros (SIMD-384/512,
 * small-footprint variant).
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define A4   state[ 4]
#define A5   state[ 5]
#define A6   state[ 6]
#define A7   state[ 7]
#define B0   state[ 8]
#define B1   state[ 9]
#define B2   state[10]
#define B3   state[11]
#define B4   state[12]
#define B5   state[13]
#define B6   state[14]
#define B7   state[15]
#define C0   state[16]
#define C1   state[17]
#define C2   state[18]
#define C3   state[19]
#define C4   state[20]
#define C5   state[21]
#define C6   state[22]
#define C7   state[23]
#define D0   state[24]
#define D1   state[25]
#define D2   state[26]
#define D3   state[27]
#define D4   state[28]
#define D5   state[29]
#define D6   state[30]
#define D7   state[31]

/*
 * STEP2_ELT is not redefined here: the definition used for
 * SIMD-224 / SIMD-256 applies unchanged.
 */

/*
 * One step over all eight lanes: rotate every A word by r, then run
 * STEP2_ELT per lane with message words w0..w7, boolean function
 * "fun", inner rotation s and lane permutation pp8b.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA[8]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		tA[4] = ROL32(A4, r); \
		tA[5] = ROL32(A5, r); \
		tA[6] = ROL32(A6, r); \
		tA[7] = ROL32(A7, r); \
		STEP2_ELT(0, w0, fun, s, pp8b); \
		STEP2_ELT(1, w1, fun, s, pp8b); \
		STEP2_ELT(2, w2, fun, s, pp8b); \
		STEP2_ELT(3, w3, fun, s, pp8b); \
		STEP2_ELT(4, w4, fun, s, pp8b); \
		STEP2_ELT(5, w5, fun, s, pp8b); \
		STEP2_ELT(6, w6, fun, s, pp8b); \
		STEP2_ELT(7, w7, fun, s, pp8b); \
	} while (0)
1162
1163
static void
1164
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
1165
{
1166
static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
1167
1168
STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
1169
IF, p0, p1, pp8k[isp + 0]);
1170
STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
1171
IF, p1, p2, pp8k[isp + 1]);
1172
STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
1173
IF, p2, p3, pp8k[isp + 2]);
1174
STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
1175
IF, p3, p0, pp8k[isp + 3]);
1176
STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
1177
MAJ, p0, p1, pp8k[isp + 4]);
1178
STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
1179
MAJ, p1, p2, pp8k[isp + 5]);
1180
STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
1181
MAJ, p2, p3, pp8k[isp + 6]);
1182
STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
1183
MAJ, p3, p0, pp8k[isp + 7]);
1184
}
1185
1186
static void
1187
compress_big(sph_simd_big_context *sc, int last)
1188
{
1189
unsigned char *x;
1190
s32 q[256];
1191
int i;
1192
u32 w[64];
1193
u32 state[32];
1194
size_t u;
1195
1196
static const size_t wbp[32] = {
1197
4 << 4, 6 << 4, 0 << 4, 2 << 4,
1198
7 << 4, 5 << 4, 3 << 4, 1 << 4,
1199
15 << 4, 11 << 4, 12 << 4, 8 << 4,
1200
9 << 4, 13 << 4, 10 << 4, 14 << 4,
1201
17 << 4, 18 << 4, 23 << 4, 20 << 4,
1202
22 << 4, 21 << 4, 16 << 4, 19 << 4,
1203
30 << 4, 24 << 4, 25 << 4, 31 << 4,
1204
27 << 4, 29 << 4, 28 << 4, 26 << 4
1205
};
1206
1207
x = sc->buf;
1208
FFT256(0, 1, 0, ll);
1209
if (last) {
1210
for (i = 0; i < 256; i ++) {
1211
s32 tq;
1212
1213
tq = q[i] + yoff_b_f[i];
1214
tq = REDS2(tq);
1215
tq = REDS1(tq);
1216
tq = REDS1(tq);
1217
q[i] = (tq <= 128 ? tq : tq - 257);
1218
}
1219
} else {
1220
for (i = 0; i < 256; i ++) {
1221
s32 tq;
1222
1223
tq = q[i] + yoff_b_n[i];
1224
tq = REDS2(tq);
1225
tq = REDS1(tq);
1226
tq = REDS1(tq);
1227
q[i] = (tq <= 128 ? tq : tq - 257);
1228
}
1229
}
1230
1231
for (i = 0; i < 32; i += 8) {
1232
state[i + 0] = sc->state[i + 0]
1233
^ sph_dec32le_aligned(x + 4 * (i + 0));
1234
state[i + 1] = sc->state[i + 1]
1235
^ sph_dec32le_aligned(x + 4 * (i + 1));
1236
state[i + 2] = sc->state[i + 2]
1237
^ sph_dec32le_aligned(x + 4 * (i + 2));
1238
state[i + 3] = sc->state[i + 3]
1239
^ sph_dec32le_aligned(x + 4 * (i + 3));
1240
state[i + 4] = sc->state[i + 4]
1241
^ sph_dec32le_aligned(x + 4 * (i + 4));
1242
state[i + 5] = sc->state[i + 5]
1243
^ sph_dec32le_aligned(x + 4 * (i + 5));
1244
state[i + 6] = sc->state[i + 6]
1245
^ sph_dec32le_aligned(x + 4 * (i + 6));
1246
state[i + 7] = sc->state[i + 7]
1247
^ sph_dec32le_aligned(x + 4 * (i + 7));
1248
}
1249
1250
#define WBREAD(sb, o1, o2, mm) do { \
1251
for (u = 0; u < 64; u += 8) { \
1252
size_t v = wbp[(u >> 3) + (sb)]; \
1253
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
1254
q[v + 2 * 0 + (o2)], mm); \
1255
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
1256
q[v + 2 * 1 + (o2)], mm); \
1257
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
1258
q[v + 2 * 2 + (o2)], mm); \
1259
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
1260
q[v + 2 * 3 + (o2)], mm); \
1261
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
1262
q[v + 2 * 4 + (o2)], mm); \
1263
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
1264
q[v + 2 * 5 + (o2)], mm); \
1265
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
1266
q[v + 2 * 6 + (o2)], mm); \
1267
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
1268
q[v + 2 * 7 + (o2)], mm); \
1269
} \
1270
} while (0)
1271
1272
WBREAD( 0, 0, 1, 185);
1273
one_round_big(state, w, 0, 3, 23, 17, 27);
1274
WBREAD( 8, 0, 1, 185);
1275
one_round_big(state, w, 1, 28, 19, 22, 7);
1276
WBREAD(16, -256, -128, 233);
1277
one_round_big(state, w, 2, 29, 9, 15, 5);
1278
WBREAD(24, -383, -255, 233);
1279
one_round_big(state, w, 3, 4, 13, 10, 25);
1280
1281
#undef WBREAD
1282
1283
STEP_BIG(
1284
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1285
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1286
IF, 4, 13, PP8_4_);
1287
STEP_BIG(
1288
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1289
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1290
IF, 13, 10, PP8_5_);
1291
STEP_BIG(
1292
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1293
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1294
IF, 10, 25, PP8_6_);
1295
STEP_BIG(
1296
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1297
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1298
IF, 25, 4, PP8_0_);
1299
1300
memcpy(sc->state, state, sizeof state);
1301
}
1302
1303
#undef A0
1304
#undef A1
1305
#undef A2
1306
#undef A3
1307
#undef A4
1308
#undef A5
1309
#undef A6
1310
#undef A7
1311
#undef B0
1312
#undef B1
1313
#undef B2
1314
#undef B3
1315
#undef B4
1316
#undef B5
1317
#undef B6
1318
#undef B7
1319
#undef C0
1320
#undef C1
1321
#undef C2
1322
#undef C3
1323
#undef C4
1324
#undef C5
1325
#undef C6
1326
#undef C7
1327
#undef D0
1328
#undef D1
1329
#undef D2
1330
#undef D3
1331
#undef D4
1332
#undef D5
1333
#undef D6
1334
#undef D7
1335
1336
#else
1337
1338
#if SPH_SIMD_NOCOPY
1339
#define A0 (sc->state[ 0])
1340
#define A1 (sc->state[ 1])
1341
#define A2 (sc->state[ 2])
1342
#define A3 (sc->state[ 3])
1343
#define A4 (sc->state[ 4])
1344
#define A5 (sc->state[ 5])
1345
#define A6 (sc->state[ 6])
1346
#define A7 (sc->state[ 7])
1347
#define B0 (sc->state[ 8])
1348
#define B1 (sc->state[ 9])
1349
#define B2 (sc->state[10])
1350
#define B3 (sc->state[11])
1351
#define B4 (sc->state[12])
1352
#define B5 (sc->state[13])
1353
#define B6 (sc->state[14])
1354
#define B7 (sc->state[15])
1355
#define C0 (sc->state[16])
1356
#define C1 (sc->state[17])
1357
#define C2 (sc->state[18])
1358
#define C3 (sc->state[19])
1359
#define C4 (sc->state[20])
1360
#define C5 (sc->state[21])
1361
#define C6 (sc->state[22])
1362
#define C7 (sc->state[23])
1363
#define D0 (sc->state[24])
1364
#define D1 (sc->state[25])
1365
#define D2 (sc->state[26])
1366
#define D3 (sc->state[27])
1367
#define D4 (sc->state[28])
1368
#define D5 (sc->state[29])
1369
#define D6 (sc->state[30])
1370
#define D7 (sc->state[31])
1371
#endif
1372
1373
static void
1374
compress_big(sph_simd_big_context *sc, int last)
1375
{
1376
unsigned char *x;
1377
s32 q[256];
1378
int i;
1379
DECL_STATE_BIG
1380
#if SPH_SIMD_NOCOPY
1381
sph_u32 saved[32];
1382
#endif
1383
1384
#if SPH_SIMD_NOCOPY
1385
memcpy(saved, sc->state, sizeof saved);
1386
#endif
1387
1388
x = sc->buf;
1389
FFT256(0, 1, 0, ll);
1390
if (last) {
1391
for (i = 0; i < 256; i ++) {
1392
s32 tq;
1393
1394
tq = q[i] + yoff_b_f[i];
1395
tq = REDS2(tq);
1396
tq = REDS1(tq);
1397
tq = REDS1(tq);
1398
q[i] = (tq <= 128 ? tq : tq - 257);
1399
}
1400
} else {
1401
for (i = 0; i < 256; i ++) {
1402
s32 tq;
1403
1404
tq = q[i] + yoff_b_n[i];
1405
tq = REDS2(tq);
1406
tq = REDS1(tq);
1407
tq = REDS1(tq);
1408
q[i] = (tq <= 128 ? tq : tq - 257);
1409
}
1410
}
1411
READ_STATE_BIG(sc);
1412
A0 ^= sph_dec32le_aligned(x + 0);
1413
A1 ^= sph_dec32le_aligned(x + 4);
1414
A2 ^= sph_dec32le_aligned(x + 8);
1415
A3 ^= sph_dec32le_aligned(x + 12);
1416
A4 ^= sph_dec32le_aligned(x + 16);
1417
A5 ^= sph_dec32le_aligned(x + 20);
1418
A6 ^= sph_dec32le_aligned(x + 24);
1419
A7 ^= sph_dec32le_aligned(x + 28);
1420
B0 ^= sph_dec32le_aligned(x + 32);
1421
B1 ^= sph_dec32le_aligned(x + 36);
1422
B2 ^= sph_dec32le_aligned(x + 40);
1423
B3 ^= sph_dec32le_aligned(x + 44);
1424
B4 ^= sph_dec32le_aligned(x + 48);
1425
B5 ^= sph_dec32le_aligned(x + 52);
1426
B6 ^= sph_dec32le_aligned(x + 56);
1427
B7 ^= sph_dec32le_aligned(x + 60);
1428
C0 ^= sph_dec32le_aligned(x + 64);
1429
C1 ^= sph_dec32le_aligned(x + 68);
1430
C2 ^= sph_dec32le_aligned(x + 72);
1431
C3 ^= sph_dec32le_aligned(x + 76);
1432
C4 ^= sph_dec32le_aligned(x + 80);
1433
C5 ^= sph_dec32le_aligned(x + 84);
1434
C6 ^= sph_dec32le_aligned(x + 88);
1435
C7 ^= sph_dec32le_aligned(x + 92);
1436
D0 ^= sph_dec32le_aligned(x + 96);
1437
D1 ^= sph_dec32le_aligned(x + 100);
1438
D2 ^= sph_dec32le_aligned(x + 104);
1439
D3 ^= sph_dec32le_aligned(x + 108);
1440
D4 ^= sph_dec32le_aligned(x + 112);
1441
D5 ^= sph_dec32le_aligned(x + 116);
1442
D6 ^= sph_dec32le_aligned(x + 120);
1443
D7 ^= sph_dec32le_aligned(x + 124);
1444
1445
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
1446
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
1447
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
1448
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
1449
#if SPH_SIMD_NOCOPY
1450
STEP_BIG(
1451
saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1452
saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1453
IF, 4, 13, PP8_4_);
1454
STEP_BIG(
1455
saved[ 8], saved[ 9], saved[10], saved[11],
1456
saved[12], saved[13], saved[14], saved[15],
1457
IF, 13, 10, PP8_5_);
1458
STEP_BIG(
1459
saved[16], saved[17], saved[18], saved[19],
1460
saved[20], saved[21], saved[22], saved[23],
1461
IF, 10, 25, PP8_6_);
1462
STEP_BIG(
1463
saved[24], saved[25], saved[26], saved[27],
1464
saved[28], saved[29], saved[30], saved[31],
1465
IF, 25, 4, PP8_0_);
1466
#else
1467
STEP_BIG(
1468
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1469
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1470
IF, 4, 13, PP8_4_);
1471
STEP_BIG(
1472
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1473
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1474
IF, 13, 10, PP8_5_);
1475
STEP_BIG(
1476
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1477
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1478
IF, 10, 25, PP8_6_);
1479
STEP_BIG(
1480
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1481
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1482
IF, 25, 4, PP8_0_);
1483
WRITE_STATE_BIG(sc);
1484
#endif
1485
}
1486
1487
#if SPH_SIMD_NOCOPY
1488
#undef A0
1489
#undef A1
1490
#undef A2
1491
#undef A3
1492
#undef A4
1493
#undef A5
1494
#undef A6
1495
#undef A7
1496
#undef B0
1497
#undef B1
1498
#undef B2
1499
#undef B3
1500
#undef B4
1501
#undef B5
1502
#undef B6
1503
#undef B7
1504
#undef C0
1505
#undef C1
1506
#undef C2
1507
#undef C3
1508
#undef C4
1509
#undef C5
1510
#undef C6
1511
#undef C7
1512
#undef D0
1513
#undef D1
1514
#undef D2
1515
#undef D3
1516
#undef D4
1517
#undef D5
1518
#undef D6
1519
#undef D7
1520
#endif
1521
1522
#endif
1523
1524
static const u32 IV224[] = {
1525
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
1526
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
1527
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
1528
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
1529
};
1530
1531
static const u32 IV256[] = {
1532
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
1533
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
1534
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
1535
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
1536
};
1537
1538
static const u32 IV384[] = {
1539
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
1540
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
1541
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
1542
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
1543
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
1544
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
1545
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
1546
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
1547
};
1548
1549
static const u32 IV512[] = {
1550
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
1551
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
1552
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
1553
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
1554
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
1555
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
1556
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
1557
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
1558
};
1559
1560
static void
1561
init_small(void *cc, const u32 *iv)
1562
{
1563
sph_simd_small_context *sc;
1564
1565
sc = cc;
1566
memcpy(sc->state, iv, sizeof sc->state);
1567
sc->count_low = sc->count_high = 0;
1568
sc->ptr = 0;
1569
}
1570
1571
static void
1572
init_big(void *cc, const u32 *iv)
1573
{
1574
sph_simd_big_context *sc;
1575
1576
sc = cc;
1577
memcpy(sc->state, iv, sizeof sc->state);
1578
sc->count_low = sc->count_high = 0;
1579
sc->ptr = 0;
1580
}
1581
1582
static void
1583
update_small(void *cc, const void *data, size_t len)
1584
{
1585
sph_simd_small_context *sc;
1586
1587
sc = cc;
1588
while (len > 0) {
1589
size_t clen;
1590
1591
clen = (sizeof sc->buf) - sc->ptr;
1592
if (clen > len)
1593
clen = len;
1594
memcpy(sc->buf + sc->ptr, data, clen);
1595
data = (const unsigned char *)data + clen;
1596
len -= clen;
1597
if ((sc->ptr += clen) == sizeof sc->buf) {
1598
compress_small(sc, 0);
1599
sc->ptr = 0;
1600
sc->count_low = T32(sc->count_low + 1);
1601
if (sc->count_low == 0)
1602
sc->count_high ++;
1603
}
1604
}
1605
}
1606
1607
static void
1608
update_big(void *cc, const void *data, size_t len)
1609
{
1610
sph_simd_big_context *sc;
1611
1612
sc = cc;
1613
while (len > 0) {
1614
size_t clen;
1615
1616
clen = (sizeof sc->buf) - sc->ptr;
1617
if (clen > len)
1618
clen = len;
1619
memcpy(sc->buf + sc->ptr, data, clen);
1620
data = (const unsigned char *)data + clen;
1621
len -= clen;
1622
if ((sc->ptr += clen) == sizeof sc->buf) {
1623
compress_big(sc, 0);
1624
sc->ptr = 0;
1625
sc->count_low = T32(sc->count_low + 1);
1626
if (sc->count_low == 0)
1627
sc->count_high ++;
1628
}
1629
}
1630
}
1631
1632
static void
1633
encode_count_small(unsigned char *dst,
1634
u32 low, u32 high, size_t ptr, unsigned n)
1635
{
1636
low = T32(low << 9);
1637
high = T32(high << 9) + (low >> 23);
1638
low += T32(ptr << 3) + n;
1639
sph_enc32le(dst, low);
1640
sph_enc32le(dst + 4, high);
1641
}
1642
1643
static void
1644
encode_count_big(unsigned char *dst,
1645
u32 low, u32 high, size_t ptr, unsigned n)
1646
{
1647
low = T32(low << 10);
1648
high = T32(high << 10) + (low >> 22);
1649
low += T32(ptr << 3) + n;
1650
sph_enc32le(dst, low);
1651
sph_enc32le(dst + 4, high);
1652
}
1653
1654
static void
1655
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1656
{
1657
sph_simd_small_context *sc;
1658
unsigned char *d;
1659
size_t u;
1660
1661
sc = cc;
1662
if (sc->ptr > 0 || n > 0) {
1663
memset(sc->buf + sc->ptr, 0,
1664
(sizeof sc->buf) - sc->ptr);
1665
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1666
compress_small(sc, 0);
1667
}
1668
memset(sc->buf, 0, sizeof sc->buf);
1669
encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1670
compress_small(sc, 1);
1671
d = dst;
1672
for (d = dst, u = 0; u < dst_len; u ++)
1673
sph_enc32le(d + (u << 2), sc->state[u]);
1674
}
1675
1676
static void
1677
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1678
{
1679
sph_simd_big_context *sc;
1680
unsigned char *d;
1681
size_t u;
1682
1683
sc = cc;
1684
if (sc->ptr > 0 || n > 0) {
1685
memset(sc->buf + sc->ptr, 0,
1686
(sizeof sc->buf) - sc->ptr);
1687
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1688
compress_big(sc, 0);
1689
}
1690
memset(sc->buf, 0, sizeof sc->buf);
1691
encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1692
compress_big(sc, 1);
1693
d = dst;
1694
for (d = dst, u = 0; u < dst_len; u ++)
1695
sph_enc32le(d + (u << 2), sc->state[u]);
1696
}
1697
1698
void
1699
sph_simd224_init(void *cc)
1700
{
1701
init_small(cc, IV224);
1702
}
1703
1704
void
1705
sph_simd224(void *cc, const void *data, size_t len)
1706
{
1707
update_small(cc, data, len);
1708
}
1709
1710
void
1711
sph_simd224_close(void *cc, void *dst)
1712
{
1713
sph_simd224_addbits_and_close(cc, 0, 0, dst);
1714
}
1715
1716
void
1717
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1718
{
1719
finalize_small(cc, ub, n, dst, 7);
1720
sph_simd224_init(cc);
1721
}
1722
1723
void
1724
sph_simd256_init(void *cc)
1725
{
1726
init_small(cc, IV256);
1727
}
1728
1729
void
1730
sph_simd256(void *cc, const void *data, size_t len)
1731
{
1732
update_small(cc, data, len);
1733
}
1734
1735
void
1736
sph_simd256_close(void *cc, void *dst)
1737
{
1738
sph_simd256_addbits_and_close(cc, 0, 0, dst);
1739
}
1740
1741
void
1742
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1743
{
1744
finalize_small(cc, ub, n, dst, 8);
1745
sph_simd256_init(cc);
1746
}
1747
1748
void
1749
sph_simd384_init(void *cc)
1750
{
1751
init_big(cc, IV384);
1752
}
1753
1754
void
1755
sph_simd384(void *cc, const void *data, size_t len)
1756
{
1757
update_big(cc, data, len);
1758
}
1759
1760
void
1761
sph_simd384_close(void *cc, void *dst)
1762
{
1763
sph_simd384_addbits_and_close(cc, 0, 0, dst);
1764
}
1765
1766
void
1767
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1768
{
1769
finalize_big(cc, ub, n, dst, 12);
1770
sph_simd384_init(cc);
1771
}
1772
1773
void
1774
sph_simd512_init(void *cc)
1775
{
1776
init_big(cc, IV512);
1777
}
1778
1779
void
1780
sph_simd512(void *cc, const void *data, size_t len)
1781
{
1782
update_big(cc, data, len);
1783
}
1784
1785
void
1786
sph_simd512_close(void *cc, void *dst)
1787
{
1788
sph_simd512_addbits_and_close(cc, 0, 0, dst);
1789
}
1790
1791
void
1792
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1793
{
1794
finalize_big(cc, ub, n, dst, 16);
1795
sph_simd512_init(cc);
1796
}
1797
#ifdef __cplusplus
1798
}
1799
#endif
1800