Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
tpruvot
GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/sha3/sph_simd.c
1201 views
1
/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
2
/*
3
* SIMD implementation.
4
*
5
* ==========================(LICENSE BEGIN)============================
6
*
7
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
8
*
9
* Permission is hereby granted, free of charge, to any person obtaining
10
* a copy of this software and associated documentation files (the
11
* "Software"), to deal in the Software without restriction, including
12
* without limitation the rights to use, copy, modify, merge, publish,
13
* distribute, sublicense, and/or sell copies of the Software, and to
14
* permit persons to whom the Software is furnished to do so, subject to
15
* the following conditions:
16
*
17
* The above copyright notice and this permission notice shall be
18
* included in all copies or substantial portions of the Software.
19
*
20
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
*
28
* ===========================(LICENSE END)=============================
29
*
30
* @author Thomas Pornin <[email protected]>
31
*/
32
33
#include <stddef.h>
34
#include <string.h>
35
#include <limits.h>
36
37
#include "sph_simd.h"
38
39
#ifdef __cplusplus
40
extern "C"{
41
#endif
42
43
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
44
#define SPH_SMALL_FOOTPRINT_SIMD 1
45
#endif
46
47
#ifdef _MSC_VER
48
#pragma warning (disable: 4146)
49
#endif
50
51
typedef sph_u32 u32;
52
typedef sph_s32 s32;
53
#define C32 SPH_C32
54
#define T32 SPH_T32
55
#define ROL32 SPH_ROTL32
56
57
#define XCAT(x, y) XCAT_(x, y)
58
#define XCAT_(x, y) x ## y
59
60
/*
61
* The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
62
*/
63
static const s32 alpha_tab[] = {
64
1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
65
190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
66
120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
67
184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
68
8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
69
235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
70
189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
71
187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
72
64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
73
81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
74
227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
75
211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
76
255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
77
134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
78
17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
79
146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
80
241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
81
44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
82
136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
83
140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
84
129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
85
95, 40, 98, 163
86
};
87
88
/*
 * Partial modular-reduction helpers; all FFT arithmetic is done
 * modulo 257.
 * Ranges:
 *  REDS1: from -32768..98302 to -383..383
 *  REDS2: from -2^31..2^31-1 to -32768..98302
 *
 * NOTE(review): both macros right-shift values that may be negative,
 * which assumes an arithmetic (sign-extending) shift.  That behavior
 * is implementation-defined in C, though it holds on all mainstream
 * compilers -- confirm if porting to an exotic toolchain.
 */
#define REDS1(x) (((x) & 0xFF) - ((x) >> 8))
#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16))
96
/*
 * If, upon entry, the values of q[] are all in the -N..N range (where
 * N >= 98302) then the new values of q[] are in the -2N..2N range.
 *
 * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
 *
 * One radix-2 butterfly pass over the 2*(hk) values q[(rb)] to
 * q[(rb) + 2*(hk) - 1]: each pair (q[rb+u], q[rb+u+hk]) is replaced by
 * (m + t, m - t) where t is the second element multiplied by the
 * twiddle factor alpha_tab[u * (as)] and reduced with REDS2.
 *
 * Element u = 0 has twiddle alpha^0 = 1 and needs no multiplication;
 * it is handled by the three statements before the loop.  The
 * "goto id" then deliberately jumps into the middle of the 4-way
 * unrolled loop body (label "id:", at element 1 of the first
 * iteration) so the multiplication for element 0 is skipped exactly
 * once.  The "id" label must therefore be unique per macro expansion.
 */
#define FFT_LOOP(rb, hk, as, id) do { \
		size_t u, v; \
		s32 m = q[(rb)]; \
		s32 n = q[(rb) + (hk)]; \
		q[(rb)] = m + n; \
		q[(rb) + (hk)] = m - n; \
		u = v = 0; \
		goto id; \
		for (; u < (hk); u += 4, v += 4 * (as)) { \
			s32 t; \
			m = q[(rb) + u + 0]; \
			n = q[(rb) + u + 0 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 0 * (as)]); \
			q[(rb) + u + 0] = m + t; \
			q[(rb) + u + 0 + (hk)] = m - t; \
		id: \
			m = q[(rb) + u + 1]; \
			n = q[(rb) + u + 1 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 1 * (as)]); \
			q[(rb) + u + 1] = m + t; \
			q[(rb) + u + 1 + (hk)] = m - t; \
			m = q[(rb) + u + 2]; \
			n = q[(rb) + u + 2 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 2 * (as)]); \
			q[(rb) + u + 2] = m + t; \
			q[(rb) + u + 2 + (hk)] = m - t; \
			m = q[(rb) + u + 3]; \
			n = q[(rb) + u + 3 + (hk)]; \
			t = REDS2(n * alpha_tab[v + 3 * (as)]); \
			q[(rb) + u + 3] = m + t; \
			q[(rb) + u + 3 + (hk)] = m - t; \
		} \
	} while (0)
135
136
/*
 * Compute eight FFT output coefficients d##0 .. d##7 from the four
 * input bytes x[(xb) + k * (xs)], k = 0..3.  The twiddle factor here
 * is alpha = 4, so all multiplications reduce to left shifts; REDS1
 * keeps intermediate values within the documented ranges.
 *
 * Output ranges:
 *   d0: min= 0 max= 1020
 *   d1: min= -67 max= 4587
 *   d2: min=-4335 max= 4335
 *   d3: min=-4147 max= 507
 *   d4: min= -510 max= 510
 *   d5: min= -252 max= 4402
 *   d6: min=-4335 max= 4335
 *   d7: min=-4332 max= 322
 */
#define FFT8(xb, xs, d) do { \
		s32 x0 = x[(xb)]; \
		s32 x1 = x[(xb) + (xs)]; \
		s32 x2 = x[(xb) + 2 * (xs)]; \
		s32 x3 = x[(xb) + 3 * (xs)]; \
		s32 a0 = x0 + x2; \
		s32 a1 = x0 + (x2 << 4); \
		s32 a2 = x0 - x2; \
		s32 a3 = x0 - (x2 << 4); \
		s32 b0 = x1 + x3; \
		s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
		s32 b2 = (x1 << 4) - (x3 << 4); \
		s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
		d ## 0 = a0 + b0; \
		d ## 1 = a1 + b1; \
		d ## 2 = a2 + b2; \
		d ## 3 = a3 + b3; \
		d ## 4 = a0 - b0; \
		d ## 5 = a1 - b1; \
		d ## 6 = a2 - b2; \
		d ## 7 = a3 - b3; \
	} while (0)
169
170
/*
 * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
 * to some shifting.
 *
 * Combines two interleaved FFT8 passes (even / odd input positions,
 * stride doubled) into the sixteen outputs q[(rb)] .. q[(rb) + 15];
 * the butterfly twiddles 2^0 .. 2^7 are applied as shifts on the
 * odd-half results d2_*.
 *
 * Output: within -591471..591723
 */
#define FFT16(xb, xs, rb) do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) + 0] = d1_0 + d2_0; \
		q[(rb) + 1] = d1_1 + (d2_1 << 1); \
		q[(rb) + 2] = d1_2 + (d2_2 << 2); \
		q[(rb) + 3] = d1_3 + (d2_3 << 3); \
		q[(rb) + 4] = d1_4 + (d2_4 << 4); \
		q[(rb) + 5] = d1_5 + (d2_5 << 5); \
		q[(rb) + 6] = d1_6 + (d2_6 << 6); \
		q[(rb) + 7] = d1_7 + (d2_7 << 7); \
		q[(rb) + 8] = d1_0 - d2_0; \
		q[(rb) + 9] = d1_1 - (d2_1 << 1); \
		q[(rb) + 10] = d1_2 - (d2_2 << 2); \
		q[(rb) + 11] = d1_3 - (d2_3 << 3); \
		q[(rb) + 12] = d1_4 - (d2_4 << 4); \
		q[(rb) + 13] = d1_5 - (d2_5 << 5); \
		q[(rb) + 14] = d1_6 - (d2_6 << 6); \
		q[(rb) + 15] = d1_7 - (d2_7 << 7); \
	} while (0)
198
199
/*
 * Output range: |q| <= 1183446
 *
 * 32-point FFT: two interleaved FFT16 halves merged by one FFT_LOOP
 * butterfly pass (twiddle stride 8 into alpha_tab).  The "id" label
 * argument is forwarded to FFT_LOOP and must be unique per expansion.
 */
#define FFT32(xb, xs, rb, id) do { \
		FFT16(xb, (xs) << 1, rb); \
		FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
		FFT_LOOP(rb, 16, 8, id); \
	} while (0)

/*
 * Output range: |q| <= 2366892
 *
 * 64-point FFT built from two FFT32 halves; XCAT derives distinct
 * sub-labels (id##a, id##b) for the nested FFT_LOOP expansions.
 */
#define FFT64(xb, xs, rb, id) do { \
		FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
		FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
		FFT_LOOP(rb, 32, 4, id); \
	} while (0)
216
217
#if SPH_SMALL_FOOTPRINT_SIMD
218
219
static void
220
fft32(unsigned char *x, size_t xs, s32 *q)
221
{
222
size_t xd;
223
224
xd = xs << 1;
225
FFT16(0, xd, 0);
226
FFT16(xs, xd, 16);
227
FFT_LOOP(0, 16, 8, label_);
228
}
229
230
#define FFT128(xb, xs, rb, id) do { \
231
fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
232
fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
233
FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
234
fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
235
fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
236
FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
237
FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
238
} while (0)
239
240
#else
241
242
/*
243
* Output range: |q| <= 4733784
244
*/
245
#define FFT128(xb, xs, rb, id) do { \
246
FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
247
FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
248
FFT_LOOP(rb, 64, 2, id); \
249
} while (0)
250
251
#endif
252
253
/*
254
* For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
255
* function which does not fit in the 32 kB L1 cache of a typical x86
256
* Intel. We therefore add a function call layer at the FFT64 level.
257
*/
258
259
static void
260
fft64(unsigned char *x, size_t xs, s32 *q)
261
{
262
size_t xd;
263
264
xd = xs << 1;
265
FFT32(0, xd, 0, label_a);
266
FFT32(xs, xd, 32, label_b);
267
FFT_LOOP(0, 32, 4, label_);
268
}
269
270
/*
271
* Output range: |q| <= 9467568
272
*/
273
#define FFT256(xb, xs, rb, id) do { \
274
fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
275
fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \
276
FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
277
fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
278
fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
279
FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
280
FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
281
} while (0)
282
283
/*
284
* alpha^(127*i) mod 257
285
*/
286
static const unsigned short yoff_s_n[] = {
287
1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
288
15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
289
225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
290
34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
291
253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
292
197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
293
128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
294
121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
295
16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
296
240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
297
2, 196, 190, 116, 60, 226, 46, 139
298
};
299
300
/*
301
* alpha^(127*i) + alpha^(125*i) mod 257
302
*/
303
static const unsigned short yoff_s_f[] = {
304
2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
305
49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
306
96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
307
17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
308
189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
309
77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
310
160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
311
181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
312
0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
313
210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
314
10, 146, 117, 251, 180, 247, 236, 108
315
};
316
317
/*
318
* beta^(255*i) mod 257
319
*/
320
static const unsigned short yoff_b_n[] = {
321
1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
322
23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
323
15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
324
88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
325
225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
326
35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
327
34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
328
11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
329
253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
330
165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
331
197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
332
162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
333
128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
334
117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
335
121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
336
213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
337
16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
338
111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
339
240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
340
123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
341
2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
342
46, 45, 139, 41
343
};
344
345
/*
346
* beta^(255*i) + beta^(253*i) mod 257
347
*/
348
static const unsigned short yoff_b_f[] = {
349
2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
350
111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
351
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
352
253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
353
96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
354
248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
355
17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
356
57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
357
189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
358
187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
359
77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
360
139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
361
160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
362
106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
363
181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
364
96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
365
0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
366
245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
367
210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
368
53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
369
10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
370
236, 192, 108, 86
371
};
372
373
/*
 * INNER packs two 16-bit products into one 32-bit word: the low half
 * is (l)*(mm) and the high half is (h)*(mm).
 *
 * W_SMALL deliberately opens a parenthesis that it never closes and
 * expands to four comma-separated INNER() values.  The expansion is
 * completed by the STEP_SMALL_ wrapper macro, which appends the
 * missing ")" so that the four values become the four message-word
 * arguments of STEP_SMALL.
 */
#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \
			+ ((u32)((h) * (mm)) << 16))

#define W_SMALL(sb, o1, o2, mm) \
	(INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
	INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
382
#define WS_0_0 W_SMALL( 4, 0, 1, 185)
383
#define WS_0_1 W_SMALL( 6, 0, 1, 185)
384
#define WS_0_2 W_SMALL( 0, 0, 1, 185)
385
#define WS_0_3 W_SMALL( 2, 0, 1, 185)
386
#define WS_0_4 W_SMALL( 7, 0, 1, 185)
387
#define WS_0_5 W_SMALL( 5, 0, 1, 185)
388
#define WS_0_6 W_SMALL( 3, 0, 1, 185)
389
#define WS_0_7 W_SMALL( 1, 0, 1, 185)
390
#define WS_1_0 W_SMALL(15, 0, 1, 185)
391
#define WS_1_1 W_SMALL(11, 0, 1, 185)
392
#define WS_1_2 W_SMALL(12, 0, 1, 185)
393
#define WS_1_3 W_SMALL( 8, 0, 1, 185)
394
#define WS_1_4 W_SMALL( 9, 0, 1, 185)
395
#define WS_1_5 W_SMALL(13, 0, 1, 185)
396
#define WS_1_6 W_SMALL(10, 0, 1, 185)
397
#define WS_1_7 W_SMALL(14, 0, 1, 185)
398
#define WS_2_0 W_SMALL(17, -128, -64, 233)
399
#define WS_2_1 W_SMALL(18, -128, -64, 233)
400
#define WS_2_2 W_SMALL(23, -128, -64, 233)
401
#define WS_2_3 W_SMALL(20, -128, -64, 233)
402
#define WS_2_4 W_SMALL(22, -128, -64, 233)
403
#define WS_2_5 W_SMALL(21, -128, -64, 233)
404
#define WS_2_6 W_SMALL(16, -128, -64, 233)
405
#define WS_2_7 W_SMALL(19, -128, -64, 233)
406
#define WS_3_0 W_SMALL(30, -191, -127, 233)
407
#define WS_3_1 W_SMALL(24, -191, -127, 233)
408
#define WS_3_2 W_SMALL(25, -191, -127, 233)
409
#define WS_3_3 W_SMALL(31, -191, -127, 233)
410
#define WS_3_4 W_SMALL(27, -191, -127, 233)
411
#define WS_3_5 W_SMALL(29, -191, -127, 233)
412
#define WS_3_6 W_SMALL(28, -191, -127, 233)
413
#define WS_3_7 W_SMALL(26, -191, -127, 233)
414
415
#define W_BIG(sb, o1, o2, mm) \
416
(INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
417
INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
418
INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
419
INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
420
INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
421
INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
422
INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
423
INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
424
425
#define WB_0_0 W_BIG( 4, 0, 1, 185)
426
#define WB_0_1 W_BIG( 6, 0, 1, 185)
427
#define WB_0_2 W_BIG( 0, 0, 1, 185)
428
#define WB_0_3 W_BIG( 2, 0, 1, 185)
429
#define WB_0_4 W_BIG( 7, 0, 1, 185)
430
#define WB_0_5 W_BIG( 5, 0, 1, 185)
431
#define WB_0_6 W_BIG( 3, 0, 1, 185)
432
#define WB_0_7 W_BIG( 1, 0, 1, 185)
433
#define WB_1_0 W_BIG(15, 0, 1, 185)
434
#define WB_1_1 W_BIG(11, 0, 1, 185)
435
#define WB_1_2 W_BIG(12, 0, 1, 185)
436
#define WB_1_3 W_BIG( 8, 0, 1, 185)
437
#define WB_1_4 W_BIG( 9, 0, 1, 185)
438
#define WB_1_5 W_BIG(13, 0, 1, 185)
439
#define WB_1_6 W_BIG(10, 0, 1, 185)
440
#define WB_1_7 W_BIG(14, 0, 1, 185)
441
#define WB_2_0 W_BIG(17, -256, -128, 233)
442
#define WB_2_1 W_BIG(18, -256, -128, 233)
443
#define WB_2_2 W_BIG(23, -256, -128, 233)
444
#define WB_2_3 W_BIG(20, -256, -128, 233)
445
#define WB_2_4 W_BIG(22, -256, -128, 233)
446
#define WB_2_5 W_BIG(21, -256, -128, 233)
447
#define WB_2_6 W_BIG(16, -256, -128, 233)
448
#define WB_2_7 W_BIG(19, -256, -128, 233)
449
#define WB_3_0 W_BIG(30, -383, -255, 233)
450
#define WB_3_1 W_BIG(24, -383, -255, 233)
451
#define WB_3_2 W_BIG(25, -383, -255, 233)
452
#define WB_3_3 W_BIG(31, -383, -255, 233)
453
#define WB_3_4 W_BIG(27, -383, -255, 233)
454
#define WB_3_5 W_BIG(29, -383, -255, 233)
455
#define WB_3_6 W_BIG(28, -383, -255, 233)
456
#define WB_3_7 W_BIG(26, -383, -255, 233)
457
458
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
459
#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
460
461
#define PP4_0_0 1
462
#define PP4_0_1 0
463
#define PP4_0_2 3
464
#define PP4_0_3 2
465
#define PP4_1_0 2
466
#define PP4_1_1 3
467
#define PP4_1_2 0
468
#define PP4_1_3 1
469
#define PP4_2_0 3
470
#define PP4_2_1 2
471
#define PP4_2_2 1
472
#define PP4_2_3 0
473
474
#define PP8_0_0 1
475
#define PP8_0_1 0
476
#define PP8_0_2 3
477
#define PP8_0_3 2
478
#define PP8_0_4 5
479
#define PP8_0_5 4
480
#define PP8_0_6 7
481
#define PP8_0_7 6
482
483
#define PP8_1_0 6
484
#define PP8_1_1 7
485
#define PP8_1_2 4
486
#define PP8_1_3 5
487
#define PP8_1_4 2
488
#define PP8_1_5 3
489
#define PP8_1_6 0
490
#define PP8_1_7 1
491
492
#define PP8_2_0 2
493
#define PP8_2_1 3
494
#define PP8_2_2 0
495
#define PP8_2_3 1
496
#define PP8_2_4 6
497
#define PP8_2_5 7
498
#define PP8_2_6 4
499
#define PP8_2_7 5
500
501
#define PP8_3_0 3
502
#define PP8_3_1 2
503
#define PP8_3_2 1
504
#define PP8_3_3 0
505
#define PP8_3_4 7
506
#define PP8_3_5 6
507
#define PP8_3_6 5
508
#define PP8_3_7 4
509
510
#define PP8_4_0 5
511
#define PP8_4_1 4
512
#define PP8_4_2 7
513
#define PP8_4_3 6
514
#define PP8_4_4 1
515
#define PP8_4_5 0
516
#define PP8_4_6 3
517
#define PP8_4_7 2
518
519
#define PP8_5_0 7
520
#define PP8_5_1 6
521
#define PP8_5_2 5
522
#define PP8_5_3 4
523
#define PP8_5_4 3
524
#define PP8_5_5 2
525
#define PP8_5_6 1
526
#define PP8_5_7 0
527
528
#define PP8_6_0 4
529
#define PP8_6_1 5
530
#define PP8_6_2 6
531
#define PP8_6_3 7
532
#define PP8_6_4 0
533
#define PP8_6_5 1
534
#define PP8_6_6 2
535
#define PP8_6_7 3
536
537
#if SPH_SIMD_NOCOPY
538
539
#define DECL_STATE_SMALL
540
#define READ_STATE_SMALL(sc)
541
#define WRITE_STATE_SMALL(sc)
542
#define DECL_STATE_BIG
543
#define READ_STATE_BIG(sc)
544
#define WRITE_STATE_BIG(sc)
545
546
#else
547
548
#define DECL_STATE_SMALL \
549
u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
550
551
#define READ_STATE_SMALL(sc) do { \
552
A0 = (sc)->state[ 0]; \
553
A1 = (sc)->state[ 1]; \
554
A2 = (sc)->state[ 2]; \
555
A3 = (sc)->state[ 3]; \
556
B0 = (sc)->state[ 4]; \
557
B1 = (sc)->state[ 5]; \
558
B2 = (sc)->state[ 6]; \
559
B3 = (sc)->state[ 7]; \
560
C0 = (sc)->state[ 8]; \
561
C1 = (sc)->state[ 9]; \
562
C2 = (sc)->state[10]; \
563
C3 = (sc)->state[11]; \
564
D0 = (sc)->state[12]; \
565
D1 = (sc)->state[13]; \
566
D2 = (sc)->state[14]; \
567
D3 = (sc)->state[15]; \
568
} while (0)
569
570
#define WRITE_STATE_SMALL(sc) do { \
571
(sc)->state[ 0] = A0; \
572
(sc)->state[ 1] = A1; \
573
(sc)->state[ 2] = A2; \
574
(sc)->state[ 3] = A3; \
575
(sc)->state[ 4] = B0; \
576
(sc)->state[ 5] = B1; \
577
(sc)->state[ 6] = B2; \
578
(sc)->state[ 7] = B3; \
579
(sc)->state[ 8] = C0; \
580
(sc)->state[ 9] = C1; \
581
(sc)->state[10] = C2; \
582
(sc)->state[11] = C3; \
583
(sc)->state[12] = D0; \
584
(sc)->state[13] = D1; \
585
(sc)->state[14] = D2; \
586
(sc)->state[15] = D3; \
587
} while (0)
588
589
#define DECL_STATE_BIG \
590
u32 A0, A1, A2, A3, A4, A5, A6, A7; \
591
u32 B0, B1, B2, B3, B4, B5, B6, B7; \
592
u32 C0, C1, C2, C3, C4, C5, C6, C7; \
593
u32 D0, D1, D2, D3, D4, D5, D6, D7;
594
595
#define READ_STATE_BIG(sc) do { \
596
A0 = (sc)->state[ 0]; \
597
A1 = (sc)->state[ 1]; \
598
A2 = (sc)->state[ 2]; \
599
A3 = (sc)->state[ 3]; \
600
A4 = (sc)->state[ 4]; \
601
A5 = (sc)->state[ 5]; \
602
A6 = (sc)->state[ 6]; \
603
A7 = (sc)->state[ 7]; \
604
B0 = (sc)->state[ 8]; \
605
B1 = (sc)->state[ 9]; \
606
B2 = (sc)->state[10]; \
607
B3 = (sc)->state[11]; \
608
B4 = (sc)->state[12]; \
609
B5 = (sc)->state[13]; \
610
B6 = (sc)->state[14]; \
611
B7 = (sc)->state[15]; \
612
C0 = (sc)->state[16]; \
613
C1 = (sc)->state[17]; \
614
C2 = (sc)->state[18]; \
615
C3 = (sc)->state[19]; \
616
C4 = (sc)->state[20]; \
617
C5 = (sc)->state[21]; \
618
C6 = (sc)->state[22]; \
619
C7 = (sc)->state[23]; \
620
D0 = (sc)->state[24]; \
621
D1 = (sc)->state[25]; \
622
D2 = (sc)->state[26]; \
623
D3 = (sc)->state[27]; \
624
D4 = (sc)->state[28]; \
625
D5 = (sc)->state[29]; \
626
D6 = (sc)->state[30]; \
627
D7 = (sc)->state[31]; \
628
} while (0)
629
630
#define WRITE_STATE_BIG(sc) do { \
631
(sc)->state[ 0] = A0; \
632
(sc)->state[ 1] = A1; \
633
(sc)->state[ 2] = A2; \
634
(sc)->state[ 3] = A3; \
635
(sc)->state[ 4] = A4; \
636
(sc)->state[ 5] = A5; \
637
(sc)->state[ 6] = A6; \
638
(sc)->state[ 7] = A7; \
639
(sc)->state[ 8] = B0; \
640
(sc)->state[ 9] = B1; \
641
(sc)->state[10] = B2; \
642
(sc)->state[11] = B3; \
643
(sc)->state[12] = B4; \
644
(sc)->state[13] = B5; \
645
(sc)->state[14] = B6; \
646
(sc)->state[15] = B7; \
647
(sc)->state[16] = C0; \
648
(sc)->state[17] = C1; \
649
(sc)->state[18] = C2; \
650
(sc)->state[19] = C3; \
651
(sc)->state[20] = C4; \
652
(sc)->state[21] = C5; \
653
(sc)->state[22] = C6; \
654
(sc)->state[23] = C7; \
655
(sc)->state[24] = D0; \
656
(sc)->state[25] = D1; \
657
(sc)->state[26] = D2; \
658
(sc)->state[27] = D3; \
659
(sc)->state[28] = D4; \
660
(sc)->state[29] = D5; \
661
(sc)->state[30] = D6; \
662
(sc)->state[31] = D7; \
663
} while (0)
664
665
#endif
666
667
/*
 * One step on lane n: add the message word (w) and the boolean
 * function fun(A, B, C) into D, rotate by s, then mix in the rotated
 * A value (tA) of a *different* lane.  The source lane is selected at
 * compile time: XCAT(tA, XCAT(ppb, n)) pastes ppb (a PP4_x_/PP8_x_
 * prefix) with n to form an identifier such as tA2.  Finally the
 * state registers rotate: D <- C <- B <- tA.
 */
#define STEP_ELT(n, w, fun, s, ppb) do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA ## n; \
	} while (0)

/*
 * One step over the four lanes of the small (SIMD-224/256) state:
 * snapshot the rotated A values first so that all four STEP_ELT
 * invocations see the pre-step values regardless of permutation.
 */
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		STEP_ELT(0, w0, fun, s, pp4b); \
		STEP_ELT(1, w1, fun, s, pp4b); \
		STEP_ELT(2, w2, fun, s, pp4b); \
		STEP_ELT(3, w3, fun, s, pp4b); \
	} while (0)

/*
 * Same as STEP_SMALL but over the eight lanes of the big
 * (SIMD-384/512) state.
 */
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
		u32 tA0 = ROL32(A0, r); \
		u32 tA1 = ROL32(A1, r); \
		u32 tA2 = ROL32(A2, r); \
		u32 tA3 = ROL32(A3, r); \
		u32 tA4 = ROL32(A4, r); \
		u32 tA5 = ROL32(A5, r); \
		u32 tA6 = ROL32(A6, r); \
		u32 tA7 = ROL32(A7, r); \
		STEP_ELT(0, w0, fun, s, pp8b); \
		STEP_ELT(1, w1, fun, s, pp8b); \
		STEP_ELT(2, w2, fun, s, pp8b); \
		STEP_ELT(3, w3, fun, s, pp8b); \
		STEP_ELT(4, w4, fun, s, pp8b); \
		STEP_ELT(5, w5, fun, s, pp8b); \
		STEP_ELT(6, w6, fun, s, pp8b); \
		STEP_ELT(7, w7, fun, s, pp8b); \
	} while (0)
704
705
#define M3_0_0 0_
706
#define M3_1_0 1_
707
#define M3_2_0 2_
708
#define M3_3_0 0_
709
#define M3_4_0 1_
710
#define M3_5_0 2_
711
#define M3_6_0 0_
712
#define M3_7_0 1_
713
714
#define M3_0_1 1_
715
#define M3_1_1 2_
716
#define M3_2_1 0_
717
#define M3_3_1 1_
718
#define M3_4_1 2_
719
#define M3_5_1 0_
720
#define M3_6_1 1_
721
#define M3_7_1 2_
722
723
#define M3_0_2 2_
724
#define M3_1_2 0_
725
#define M3_2_2 1_
726
#define M3_3_2 2_
727
#define M3_4_2 0_
728
#define M3_5_2 1_
729
#define M3_6_2 2_
730
#define M3_7_2 0_
731
732
/*
 * STEP_SMALL_ completes the parenthesis deliberately left open by the
 * W_SMALL-based "w" argument: "STEP_SMALL w" pastes the four message
 * words (already preceded by "("), and ", fun, r, s, pp4b)" closes
 * the macro call -- hence the apparently unbalanced ")" below.
 */
#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b)

/*
 * One round of the small compression function: eight steps, the first
 * four using IF and the last four using MAJ.  Rotation amounts cycle
 * through p0..p3 and the lane permutation for each step is selected
 * from the period-3 M3_ schedule, offset by isp.
 */
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \
		STEP_SMALL_(WS_ ## ri ## 0, \
			IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 1, \
			IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 2, \
			IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 3, \
			IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
		STEP_SMALL_(WS_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
	} while (0)
752
753
#define M7_0_0 0_
754
#define M7_1_0 1_
755
#define M7_2_0 2_
756
#define M7_3_0 3_
757
#define M7_4_0 4_
758
#define M7_5_0 5_
759
#define M7_6_0 6_
760
#define M7_7_0 0_
761
762
#define M7_0_1 1_
763
#define M7_1_1 2_
764
#define M7_2_1 3_
765
#define M7_3_1 4_
766
#define M7_4_1 5_
767
#define M7_5_1 6_
768
#define M7_6_1 0_
769
#define M7_7_1 1_
770
771
#define M7_0_2 2_
772
#define M7_1_2 3_
773
#define M7_2_2 4_
774
#define M7_3_2 5_
775
#define M7_4_2 6_
776
#define M7_5_2 0_
777
#define M7_6_2 1_
778
#define M7_7_2 2_
779
780
#define M7_0_3 3_
781
#define M7_1_3 4_
782
#define M7_2_3 5_
783
#define M7_3_3 6_
784
#define M7_4_3 0_
785
#define M7_5_3 1_
786
#define M7_6_3 2_
787
#define M7_7_3 3_
788
789
/*
 * Analogous to STEP_SMALL_: the apparently unbalanced ")" closes the
 * parenthesis opened inside the W_BIG-based "w" argument, turning its
 * eight comma-separated values into arguments of STEP_BIG.
 */
#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)

/*
 * One round of the big compression function: eight steps (four IF,
 * four MAJ), rotation amounts cycling p0..p3, lane permutations taken
 * from the period-7 M7_ schedule, offset by isp.
 */
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
		STEP_BIG_(WB_ ## ri ## 0, \
			IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 1, \
			IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 2, \
			IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 3, \
			IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 4, \
			MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 5, \
			MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 6, \
			MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
		STEP_BIG_(WB_ ## ri ## 7, \
			MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
	} while (0)
809
810
#if SPH_SMALL_FOOTPRINT_SIMD
811
812
#define A0 state[ 0]
813
#define A1 state[ 1]
814
#define A2 state[ 2]
815
#define A3 state[ 3]
816
#define B0 state[ 4]
817
#define B1 state[ 5]
818
#define B2 state[ 6]
819
#define B3 state[ 7]
820
#define C0 state[ 8]
821
#define C1 state[ 9]
822
#define C2 state[10]
823
#define C3 state[11]
824
#define D0 state[12]
825
#define D1 state[13]
826
#define D2 state[14]
827
#define D3 state[15]
828
829
/*
 * Run-time variants of STEP_ELT / STEP_SMALL for the small-footprint
 * build: the lane permutation is applied by XOR-ing a run-time mask
 * (ppb) into the index of a real tA[] array, instead of token-pasting
 * a compile-time PP4_ constant into an identifier.
 */
#define STEP2_ELT(n, w, fun, s, ppb) do { \
		u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
		A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
		D ## n = C ## n; \
		C ## n = B ## n; \
		B ## n = tA[n]; \
	} while (0)

/*
 * One four-lane step: snapshot the rotated A values into tA[] first
 * so that every STEP2_ELT sees pre-step values regardless of the
 * permutation mask.
 */
#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
		u32 tA[4]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		STEP2_ELT(0, w0, fun, s, pp4b); \
		STEP2_ELT(1, w1, fun, s, pp4b); \
		STEP2_ELT(2, w2, fun, s, pp4b); \
		STEP2_ELT(3, w3, fun, s, pp4b); \
	} while (0)
848
849
/*
 * Perform one round (eight STEP2_SMALL groups) of the small
 * compression function.  The first four groups use IF, the last four
 * MAJ; rotation amounts cycle through p0..p3.
 *
 * state: 16-word working state.  Note that the A0..D3 macros in
 *        effect here expand to state[0]..state[15], so STEP2_SMALL
 *        operates directly on this parameter.
 * w:     32 expanded message words for this round.
 * isp:   offset into the pp4k permutation schedule (0..3 at call
 *        sites).
 */
static void
one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
{
	/*
	 * Lane-permutation masks: STEP2_ELT mixes lane n with
	 * tA[pp4k[isp + i] ^ n].  Masks 1, 2, 3 realize the PP4_0_,
	 * PP4_1_, PP4_2_ permutations; the table repeats with period 3.
	 */
	static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };

	STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
	STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
	STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
	STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
	STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
	STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
	STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
	STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
}
863
864
/*
 * Compress one 64-byte message block (sc->buf) into the small
 * (SIMD-224/256) state -- small-footprint variant.  "last" selects
 * the final-block tweak: the yoff_s_f[] offsets are added to the FFT
 * output instead of yoff_s_n[].
 *
 * Note: the A0..D3 macros in effect here expand to state[0]..state[15]
 * (the local array below), so STEP_SMALL operates on the local copy;
 * the result is committed to sc->state by the final memcpy.
 */
static void
compress_small(sph_simd_small_context *sc, int last)
{
	unsigned char *x;
	s32 q[128];
	int i;
	u32 w[32];
	u32 state[16];
	size_t u;

	/*
	 * Message-expansion read order: wsp[i] = 8 * (permuted block
	 * index).  The four groups of eight entries match the WS_0_*,
	 * WS_1_*, WS_2_*, WS_3_* block orderings of the unrolled code.
	 */
	static const size_t wsp[32] = {
		4 << 3, 6 << 3, 0 << 3, 2 << 3,
		7 << 3, 5 << 3, 3 << 3, 1 << 3,
		15 << 3, 11 << 3, 12 << 3, 8 << 3,
		9 << 3, 13 << 3, 10 << 3, 14 << 3,
		17 << 3, 18 << 3, 23 << 3, 20 << 3,
		22 << 3, 21 << 3, 16 << 3, 19 << 3,
		30 << 3, 24 << 3, 25 << 3, 31 << 3,
		27 << 3, 29 << 3, 28 << 3, 26 << 3
	};

	x = sc->buf;
	/* 128-point FFT of the message block into q[0..127]. */
	FFT128(0, 1, 0, ll);
	if (last) {
		/* Final block: add the alpha^(127i)+alpha^(125i) offsets,
		   then normalize each coefficient into -128..128. */
		for (i = 0; i < 128; i ++) {
			s32 tq;

			tq = q[i] + yoff_s_f[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	} else {
		/* Non-final block: same normalization with the
		   alpha^(127i) offsets. */
		for (i = 0; i < 128; i ++) {
			s32 tq;

			tq = q[i] + yoff_s_n[i];
			tq = REDS2(tq);
			tq = REDS1(tq);
			tq = REDS1(tq);
			q[i] = (tq <= 128 ? tq : tq - 257);
		}
	}

	/* Working state = chaining value XOR raw message block. */
	for (i = 0; i < 16; i += 4) {
		state[i + 0] = sc->state[i + 0]
			^ sph_dec32le_aligned(x + 4 * (i + 0));
		state[i + 1] = sc->state[i + 1]
			^ sph_dec32le_aligned(x + 4 * (i + 1));
		state[i + 2] = sc->state[i + 2]
			^ sph_dec32le_aligned(x + 4 * (i + 2));
		state[i + 3] = sc->state[i + 3]
			^ sph_dec32le_aligned(x + 4 * (i + 3));
	}

/*
 * Expand 32 message words into w[]: for each step, read eight FFT
 * coefficients (four INNER pairs) from the block selected by wsp[],
 * at offsets o1/o2, multiplied by mm (run-time equivalent of the
 * W_SMALL macro).
 */
#define WSREAD(sb, o1, o2, mm) do { \
		for (u = 0; u < 32; u += 4) { \
			size_t v = wsp[(u >> 2) + (sb)]; \
			w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
				q[v + 2 * 0 + (o2)], mm); \
			w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
				q[v + 2 * 1 + (o2)], mm); \
			w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
				q[v + 2 * 2 + (o2)], mm); \
			w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
				q[v + 2 * 3 + (o2)], mm); \
		} \
	} while (0)

	/* Four rounds; (o1, o2, mm) and rotation counts per round. */
	WSREAD( 0, 0, 1, 185);
	one_round_small(state, w, 0, 3, 23, 17, 27);
	WSREAD( 8, 0, 1, 185);
	one_round_small(state, w, 2, 28, 19, 22, 7);
	WSREAD(16, -128, -64, 233);
	one_round_small(state, w, 1, 29, 9, 15, 5);
	WSREAD(24, -191, -127, 233);
	one_round_small(state, w, 0, 4, 13, 10, 25);

#undef WSREAD

	/* Feed-forward: four extra steps that reinject the previous
	   chaining value (sc->state) as the message words. */
	STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
		IF, 4, 13, PP4_2_);
	STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
		IF, 13, 10, PP4_0_);
	STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
		IF, 10, 25, PP4_1_);
	STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
		IF, 25, 4, PP4_2_);

	/* Commit the new chaining value. */
	memcpy(sc->state, state, sizeof state);
}
956
957
#undef A0
958
#undef A1
959
#undef A2
960
#undef A3
961
#undef B0
962
#undef B1
963
#undef B2
964
#undef B3
965
#undef C0
966
#undef C1
967
#undef C2
968
#undef C3
969
#undef D0
970
#undef D1
971
#undef D2
972
#undef D3
973
974
#else
975
976
#if SPH_SIMD_NOCOPY
977
#define A0 (sc->state[ 0])
978
#define A1 (sc->state[ 1])
979
#define A2 (sc->state[ 2])
980
#define A3 (sc->state[ 3])
981
#define B0 (sc->state[ 4])
982
#define B1 (sc->state[ 5])
983
#define B2 (sc->state[ 6])
984
#define B3 (sc->state[ 7])
985
#define C0 (sc->state[ 8])
986
#define C1 (sc->state[ 9])
987
#define C2 (sc->state[10])
988
#define C3 (sc->state[11])
989
#define D0 (sc->state[12])
990
#define D1 (sc->state[13])
991
#define D2 (sc->state[14])
992
#define D3 (sc->state[15])
993
#endif
994
995
static void
996
compress_small(sph_simd_small_context *sc, int last)
997
{
998
unsigned char *x;
999
s32 q[128];
1000
int i;
1001
DECL_STATE_SMALL
1002
#if SPH_SIMD_NOCOPY
1003
sph_u32 saved[16];
1004
#endif
1005
1006
#if SPH_SIMD_NOCOPY
1007
memcpy(saved, sc->state, sizeof saved);
1008
#endif
1009
x = sc->buf;
1010
FFT128(0, 1, 0, ll);
1011
if (last) {
1012
for (i = 0; i < 128; i ++) {
1013
s32 tq;
1014
1015
tq = q[i] + yoff_s_f[i];
1016
tq = REDS2(tq);
1017
tq = REDS1(tq);
1018
tq = REDS1(tq);
1019
q[i] = (tq <= 128 ? tq : tq - 257);
1020
}
1021
} else {
1022
for (i = 0; i < 128; i ++) {
1023
s32 tq;
1024
1025
tq = q[i] + yoff_s_n[i];
1026
tq = REDS2(tq);
1027
tq = REDS1(tq);
1028
tq = REDS1(tq);
1029
q[i] = (tq <= 128 ? tq : tq - 257);
1030
}
1031
}
1032
READ_STATE_SMALL(sc);
1033
A0 ^= sph_dec32le_aligned(x + 0);
1034
A1 ^= sph_dec32le_aligned(x + 4);
1035
A2 ^= sph_dec32le_aligned(x + 8);
1036
A3 ^= sph_dec32le_aligned(x + 12);
1037
B0 ^= sph_dec32le_aligned(x + 16);
1038
B1 ^= sph_dec32le_aligned(x + 20);
1039
B2 ^= sph_dec32le_aligned(x + 24);
1040
B3 ^= sph_dec32le_aligned(x + 28);
1041
C0 ^= sph_dec32le_aligned(x + 32);
1042
C1 ^= sph_dec32le_aligned(x + 36);
1043
C2 ^= sph_dec32le_aligned(x + 40);
1044
C3 ^= sph_dec32le_aligned(x + 44);
1045
D0 ^= sph_dec32le_aligned(x + 48);
1046
D1 ^= sph_dec32le_aligned(x + 52);
1047
D2 ^= sph_dec32le_aligned(x + 56);
1048
D3 ^= sph_dec32le_aligned(x + 60);
1049
ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
1050
ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
1051
ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
1052
ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
1053
#if SPH_SIMD_NOCOPY
1054
STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1055
IF, 4, 13, PP4_2_);
1056
STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1057
IF, 13, 10, PP4_0_);
1058
STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
1059
IF, 10, 25, PP4_1_);
1060
STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
1061
IF, 25, 4, PP4_2_);
1062
#else
1063
STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1064
IF, 4, 13, PP4_2_);
1065
STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1066
IF, 13, 10, PP4_0_);
1067
STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1068
IF, 10, 25, PP4_1_);
1069
STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1070
IF, 25, 4, PP4_2_);
1071
WRITE_STATE_SMALL(sc);
1072
#endif
1073
}
1074
1075
#if SPH_SIMD_NOCOPY
1076
#undef A0
1077
#undef A1
1078
#undef A2
1079
#undef A3
1080
#undef B0
1081
#undef B1
1082
#undef B2
1083
#undef B3
1084
#undef C0
1085
#undef C1
1086
#undef C2
1087
#undef C3
1088
#undef D0
1089
#undef D1
1090
#undef D2
1091
#undef D3
1092
#endif
1093
1094
#endif
1095
1096
#if SPH_SMALL_FOOTPRINT_SIMD

/*
 * Alias the 32 words of the expanded state array onto the A/B/C/D
 * lane names expected by the STEP2_* round macros (SIMD-384/512,
 * small-footprint variant).
 */
#define A0   state[ 0]
#define A1   state[ 1]
#define A2   state[ 2]
#define A3   state[ 3]
#define A4   state[ 4]
#define A5   state[ 5]
#define A6   state[ 6]
#define A7   state[ 7]
#define B0   state[ 8]
#define B1   state[ 9]
#define B2   state[10]
#define B3   state[11]
#define B4   state[12]
#define B5   state[13]
#define B6   state[14]
#define B7   state[15]
#define C0   state[16]
#define C1   state[17]
#define C2   state[18]
#define C3   state[19]
#define C4   state[20]
#define C5   state[21]
#define C6   state[22]
#define C7   state[23]
#define D0   state[24]
#define D1   state[25]
#define D2   state[26]
#define D3   state[27]
#define D4   state[28]
#define D5   state[29]
#define D6   state[30]
#define D7   state[31]

/*
 * STEP2_ELT is not redefined here: the definition used for
 * SIMD-224 / SIMD-256 applies unchanged.
 */

/*
 * One step over all eight lanes: rotate every A word by r, then run
 * STEP2_ELT per lane with message words w0..w7, boolean function
 * "fun", inner rotation s and lane permutation pp8b.
 */
#define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)   do { \
		u32 tA[8]; \
		tA[0] = ROL32(A0, r); \
		tA[1] = ROL32(A1, r); \
		tA[2] = ROL32(A2, r); \
		tA[3] = ROL32(A3, r); \
		tA[4] = ROL32(A4, r); \
		tA[5] = ROL32(A5, r); \
		tA[6] = ROL32(A6, r); \
		tA[7] = ROL32(A7, r); \
		STEP2_ELT(0, w0, fun, s, pp8b); \
		STEP2_ELT(1, w1, fun, s, pp8b); \
		STEP2_ELT(2, w2, fun, s, pp8b); \
		STEP2_ELT(3, w3, fun, s, pp8b); \
		STEP2_ELT(4, w4, fun, s, pp8b); \
		STEP2_ELT(5, w5, fun, s, pp8b); \
		STEP2_ELT(6, w6, fun, s, pp8b); \
		STEP2_ELT(7, w7, fun, s, pp8b); \
	} while (0)
1162
1163
static void
1164
one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
1165
{
1166
static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
1167
1168
STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
1169
IF, p0, p1, pp8k[isp + 0]);
1170
STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
1171
IF, p1, p2, pp8k[isp + 1]);
1172
STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
1173
IF, p2, p3, pp8k[isp + 2]);
1174
STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
1175
IF, p3, p0, pp8k[isp + 3]);
1176
STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
1177
MAJ, p0, p1, pp8k[isp + 4]);
1178
STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
1179
MAJ, p1, p2, pp8k[isp + 5]);
1180
STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
1181
MAJ, p2, p3, pp8k[isp + 6]);
1182
STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
1183
MAJ, p3, p0, pp8k[isp + 7]);
1184
}
1185
1186
static void
1187
compress_big(sph_simd_big_context *sc, int last)
1188
{
1189
unsigned char *x;
1190
s32 q[256];
1191
int i;
1192
u32 w[64];
1193
u32 state[32];
1194
size_t u;
1195
1196
static const size_t wbp[32] = {
1197
4 << 4, 6 << 4, 0 << 4, 2 << 4,
1198
7 << 4, 5 << 4, 3 << 4, 1 << 4,
1199
15 << 4, 11 << 4, 12 << 4, 8 << 4,
1200
9 << 4, 13 << 4, 10 << 4, 14 << 4,
1201
17 << 4, 18 << 4, 23 << 4, 20 << 4,
1202
22 << 4, 21 << 4, 16 << 4, 19 << 4,
1203
30 << 4, 24 << 4, 25 << 4, 31 << 4,
1204
27 << 4, 29 << 4, 28 << 4, 26 << 4
1205
};
1206
1207
x = sc->buf;
1208
FFT256(0, 1, 0, ll);
1209
if (last) {
1210
for (i = 0; i < 256; i ++) {
1211
s32 tq;
1212
1213
tq = q[i] + yoff_b_f[i];
1214
tq = REDS2(tq);
1215
tq = REDS1(tq);
1216
tq = REDS1(tq);
1217
q[i] = (tq <= 128 ? tq : tq - 257);
1218
}
1219
} else {
1220
for (i = 0; i < 256; i ++) {
1221
s32 tq;
1222
1223
tq = q[i] + yoff_b_n[i];
1224
tq = REDS2(tq);
1225
tq = REDS1(tq);
1226
tq = REDS1(tq);
1227
q[i] = (tq <= 128 ? tq : tq - 257);
1228
}
1229
}
1230
1231
for (i = 0; i < 32; i += 8) {
1232
state[i + 0] = sc->state[i + 0]
1233
^ sph_dec32le_aligned(x + 4 * (i + 0));
1234
state[i + 1] = sc->state[i + 1]
1235
^ sph_dec32le_aligned(x + 4 * (i + 1));
1236
state[i + 2] = sc->state[i + 2]
1237
^ sph_dec32le_aligned(x + 4 * (i + 2));
1238
state[i + 3] = sc->state[i + 3]
1239
^ sph_dec32le_aligned(x + 4 * (i + 3));
1240
state[i + 4] = sc->state[i + 4]
1241
^ sph_dec32le_aligned(x + 4 * (i + 4));
1242
state[i + 5] = sc->state[i + 5]
1243
^ sph_dec32le_aligned(x + 4 * (i + 5));
1244
state[i + 6] = sc->state[i + 6]
1245
^ sph_dec32le_aligned(x + 4 * (i + 6));
1246
state[i + 7] = sc->state[i + 7]
1247
^ sph_dec32le_aligned(x + 4 * (i + 7));
1248
}
1249
1250
#define WBREAD(sb, o1, o2, mm) do { \
1251
for (u = 0; u < 64; u += 8) { \
1252
size_t v = wbp[(u >> 3) + (sb)]; \
1253
w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
1254
q[v + 2 * 0 + (o2)], mm); \
1255
w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
1256
q[v + 2 * 1 + (o2)], mm); \
1257
w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
1258
q[v + 2 * 2 + (o2)], mm); \
1259
w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
1260
q[v + 2 * 3 + (o2)], mm); \
1261
w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
1262
q[v + 2 * 4 + (o2)], mm); \
1263
w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
1264
q[v + 2 * 5 + (o2)], mm); \
1265
w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
1266
q[v + 2 * 6 + (o2)], mm); \
1267
w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
1268
q[v + 2 * 7 + (o2)], mm); \
1269
} \
1270
} while (0)
1271
1272
WBREAD( 0, 0, 1, 185);
1273
one_round_big(state, w, 0, 3, 23, 17, 27);
1274
WBREAD( 8, 0, 1, 185);
1275
one_round_big(state, w, 1, 28, 19, 22, 7);
1276
WBREAD(16, -256, -128, 233);
1277
one_round_big(state, w, 2, 29, 9, 15, 5);
1278
WBREAD(24, -383, -255, 233);
1279
one_round_big(state, w, 3, 4, 13, 10, 25);
1280
1281
#undef WBREAD
1282
1283
STEP_BIG(
1284
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1285
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1286
IF, 4, 13, PP8_4_);
1287
STEP_BIG(
1288
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1289
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1290
IF, 13, 10, PP8_5_);
1291
STEP_BIG(
1292
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1293
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1294
IF, 10, 25, PP8_6_);
1295
STEP_BIG(
1296
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1297
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1298
IF, 25, 4, PP8_0_);
1299
1300
memcpy(sc->state, state, sizeof state);
1301
}
1302
1303
#undef A0
1304
#undef A1
1305
#undef A2
1306
#undef A3
1307
#undef A4
1308
#undef A5
1309
#undef A6
1310
#undef A7
1311
#undef B0
1312
#undef B1
1313
#undef B2
1314
#undef B3
1315
#undef B4
1316
#undef B5
1317
#undef B6
1318
#undef B7
1319
#undef C0
1320
#undef C1
1321
#undef C2
1322
#undef C3
1323
#undef C4
1324
#undef C5
1325
#undef C6
1326
#undef C7
1327
#undef D0
1328
#undef D1
1329
#undef D2
1330
#undef D3
1331
#undef D4
1332
#undef D5
1333
#undef D6
1334
#undef D7
1335
1336
#else
1337
1338
#if SPH_SIMD_NOCOPY
1339
#define A0 (sc->state[ 0])
1340
#define A1 (sc->state[ 1])
1341
#define A2 (sc->state[ 2])
1342
#define A3 (sc->state[ 3])
1343
#define A4 (sc->state[ 4])
1344
#define A5 (sc->state[ 5])
1345
#define A6 (sc->state[ 6])
1346
#define A7 (sc->state[ 7])
1347
#define B0 (sc->state[ 8])
1348
#define B1 (sc->state[ 9])
1349
#define B2 (sc->state[10])
1350
#define B3 (sc->state[11])
1351
#define B4 (sc->state[12])
1352
#define B5 (sc->state[13])
1353
#define B6 (sc->state[14])
1354
#define B7 (sc->state[15])
1355
#define C0 (sc->state[16])
1356
#define C1 (sc->state[17])
1357
#define C2 (sc->state[18])
1358
#define C3 (sc->state[19])
1359
#define C4 (sc->state[20])
1360
#define C5 (sc->state[21])
1361
#define C6 (sc->state[22])
1362
#define C7 (sc->state[23])
1363
#define D0 (sc->state[24])
1364
#define D1 (sc->state[25])
1365
#define D2 (sc->state[26])
1366
#define D3 (sc->state[27])
1367
#define D4 (sc->state[28])
1368
#define D5 (sc->state[29])
1369
#define D6 (sc->state[30])
1370
#define D7 (sc->state[31])
1371
#endif
1372
1373
static void
1374
compress_big(sph_simd_big_context *sc, int last)
1375
{
1376
unsigned char *x;
1377
s32 q[256];
1378
int i;
1379
DECL_STATE_BIG
1380
#if SPH_SIMD_NOCOPY
1381
sph_u32 saved[32];
1382
#endif
1383
1384
#if SPH_SIMD_NOCOPY
1385
memcpy(saved, sc->state, sizeof saved);
1386
#endif
1387
1388
x = sc->buf;
1389
FFT256(0, 1, 0, ll);
1390
if (last) {
1391
for (i = 0; i < 256; i ++) {
1392
s32 tq;
1393
1394
tq = q[i] + yoff_b_f[i];
1395
tq = REDS2(tq);
1396
tq = REDS1(tq);
1397
tq = REDS1(tq);
1398
q[i] = (tq <= 128 ? tq : tq - 257);
1399
}
1400
} else {
1401
for (i = 0; i < 256; i ++) {
1402
s32 tq;
1403
1404
tq = q[i] + yoff_b_n[i];
1405
tq = REDS2(tq);
1406
tq = REDS1(tq);
1407
tq = REDS1(tq);
1408
q[i] = (tq <= 128 ? tq : tq - 257);
1409
}
1410
}
1411
READ_STATE_BIG(sc);
1412
A0 ^= sph_dec32le_aligned(x + 0);
1413
A1 ^= sph_dec32le_aligned(x + 4);
1414
A2 ^= sph_dec32le_aligned(x + 8);
1415
A3 ^= sph_dec32le_aligned(x + 12);
1416
A4 ^= sph_dec32le_aligned(x + 16);
1417
A5 ^= sph_dec32le_aligned(x + 20);
1418
A6 ^= sph_dec32le_aligned(x + 24);
1419
A7 ^= sph_dec32le_aligned(x + 28);
1420
B0 ^= sph_dec32le_aligned(x + 32);
1421
B1 ^= sph_dec32le_aligned(x + 36);
1422
B2 ^= sph_dec32le_aligned(x + 40);
1423
B3 ^= sph_dec32le_aligned(x + 44);
1424
B4 ^= sph_dec32le_aligned(x + 48);
1425
B5 ^= sph_dec32le_aligned(x + 52);
1426
B6 ^= sph_dec32le_aligned(x + 56);
1427
B7 ^= sph_dec32le_aligned(x + 60);
1428
C0 ^= sph_dec32le_aligned(x + 64);
1429
C1 ^= sph_dec32le_aligned(x + 68);
1430
C2 ^= sph_dec32le_aligned(x + 72);
1431
C3 ^= sph_dec32le_aligned(x + 76);
1432
C4 ^= sph_dec32le_aligned(x + 80);
1433
C5 ^= sph_dec32le_aligned(x + 84);
1434
C6 ^= sph_dec32le_aligned(x + 88);
1435
C7 ^= sph_dec32le_aligned(x + 92);
1436
D0 ^= sph_dec32le_aligned(x + 96);
1437
D1 ^= sph_dec32le_aligned(x + 100);
1438
D2 ^= sph_dec32le_aligned(x + 104);
1439
D3 ^= sph_dec32le_aligned(x + 108);
1440
D4 ^= sph_dec32le_aligned(x + 112);
1441
D5 ^= sph_dec32le_aligned(x + 116);
1442
D6 ^= sph_dec32le_aligned(x + 120);
1443
D7 ^= sph_dec32le_aligned(x + 124);
1444
1445
ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
1446
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
1447
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
1448
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
1449
#if SPH_SIMD_NOCOPY
1450
STEP_BIG(
1451
saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1452
saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1453
IF, 4, 13, PP8_4_);
1454
STEP_BIG(
1455
saved[ 8], saved[ 9], saved[10], saved[11],
1456
saved[12], saved[13], saved[14], saved[15],
1457
IF, 13, 10, PP8_5_);
1458
STEP_BIG(
1459
saved[16], saved[17], saved[18], saved[19],
1460
saved[20], saved[21], saved[22], saved[23],
1461
IF, 10, 25, PP8_6_);
1462
STEP_BIG(
1463
saved[24], saved[25], saved[26], saved[27],
1464
saved[28], saved[29], saved[30], saved[31],
1465
IF, 25, 4, PP8_0_);
1466
#else
1467
STEP_BIG(
1468
sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1469
sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1470
IF, 4, 13, PP8_4_);
1471
STEP_BIG(
1472
sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1473
sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1474
IF, 13, 10, PP8_5_);
1475
STEP_BIG(
1476
sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1477
sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1478
IF, 10, 25, PP8_6_);
1479
STEP_BIG(
1480
sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1481
sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1482
IF, 25, 4, PP8_0_);
1483
WRITE_STATE_BIG(sc);
1484
#endif
1485
}
1486
1487
#if SPH_SIMD_NOCOPY
1488
#undef A0
1489
#undef A1
1490
#undef A2
1491
#undef A3
1492
#undef A4
1493
#undef A5
1494
#undef A6
1495
#undef A7
1496
#undef B0
1497
#undef B1
1498
#undef B2
1499
#undef B3
1500
#undef B4
1501
#undef B5
1502
#undef B6
1503
#undef B7
1504
#undef C0
1505
#undef C1
1506
#undef C2
1507
#undef C3
1508
#undef C4
1509
#undef C5
1510
#undef C6
1511
#undef C7
1512
#undef D0
1513
#undef D1
1514
#undef D2
1515
#undef D3
1516
#undef D4
1517
#undef D5
1518
#undef D6
1519
#undef D7
1520
#endif
1521
1522
#endif
1523
1524
static const u32 IV224[] = {
1525
C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
1526
C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
1527
C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
1528
C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
1529
};
1530
1531
static const u32 IV256[] = {
1532
C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
1533
C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
1534
C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
1535
C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
1536
};
1537
1538
static const u32 IV384[] = {
1539
C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
1540
C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
1541
C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
1542
C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
1543
C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
1544
C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
1545
C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
1546
C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
1547
};
1548
1549
static const u32 IV512[] = {
1550
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
1551
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
1552
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
1553
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
1554
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
1555
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
1556
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
1557
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
1558
};
1559
1560
static void
1561
init_small(void *cc, const u32 *iv)
1562
{
1563
sph_simd_small_context *sc;
1564
1565
sc = cc;
1566
memcpy(sc->state, iv, sizeof sc->state);
1567
sc->count_low = sc->count_high = 0;
1568
sc->ptr = 0;
1569
}
1570
1571
static void
1572
init_big(void *cc, const u32 *iv)
1573
{
1574
sph_simd_big_context *sc;
1575
1576
sc = cc;
1577
memcpy(sc->state, iv, sizeof sc->state);
1578
sc->count_low = sc->count_high = 0;
1579
sc->ptr = 0;
1580
}
1581
1582
static void
1583
update_small(void *cc, const void *data, size_t len)
1584
{
1585
sph_simd_small_context *sc;
1586
1587
sc = cc;
1588
while (len > 0) {
1589
size_t clen;
1590
1591
clen = (sizeof sc->buf) - sc->ptr;
1592
if (clen > len)
1593
clen = len;
1594
memcpy(sc->buf + sc->ptr, data, clen);
1595
data = (const unsigned char *)data + clen;
1596
len -= clen;
1597
if ((sc->ptr += clen) == sizeof sc->buf) {
1598
compress_small(sc, 0);
1599
sc->ptr = 0;
1600
sc->count_low = T32(sc->count_low + 1);
1601
if (sc->count_low == 0)
1602
sc->count_high ++;
1603
}
1604
}
1605
}
1606
1607
static void
1608
update_big(void *cc, const void *data, size_t len)
1609
{
1610
sph_simd_big_context *sc;
1611
1612
sc = cc;
1613
while (len > 0) {
1614
size_t clen;
1615
1616
clen = (sizeof sc->buf) - sc->ptr;
1617
if (clen > len)
1618
clen = len;
1619
memcpy(sc->buf + sc->ptr, data, clen);
1620
data = (const unsigned char *)data + clen;
1621
len -= clen;
1622
if ((sc->ptr += clen) == sizeof sc->buf) {
1623
compress_big(sc, 0);
1624
sc->ptr = 0;
1625
sc->count_low = T32(sc->count_low + 1);
1626
if (sc->count_low == 0)
1627
sc->count_high ++;
1628
}
1629
}
1630
}
1631
1632
static void
1633
encode_count_small(unsigned char *dst,
1634
u32 low, u32 high, size_t ptr, unsigned n)
1635
{
1636
low = T32(low << 9);
1637
high = T32(high << 9) + (low >> 23);
1638
low += T32(ptr << 3) + n;
1639
sph_enc32le(dst, low);
1640
sph_enc32le(dst + 4, high);
1641
}
1642
1643
static void
1644
encode_count_big(unsigned char *dst,
1645
u32 low, u32 high, size_t ptr, unsigned n)
1646
{
1647
low = T32(low << 10);
1648
high = T32(high << 10) + (low >> 22);
1649
low += T32(ptr << 3) + n;
1650
sph_enc32le(dst, low);
1651
sph_enc32le(dst + 4, high);
1652
}
1653
1654
static void
1655
finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1656
{
1657
sph_simd_small_context *sc;
1658
unsigned char *d;
1659
size_t u;
1660
1661
sc = cc;
1662
if (sc->ptr > 0 || n > 0) {
1663
memset(sc->buf + sc->ptr, 0,
1664
(sizeof sc->buf) - sc->ptr);
1665
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1666
compress_small(sc, 0);
1667
}
1668
memset(sc->buf, 0, sizeof sc->buf);
1669
encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1670
compress_small(sc, 1);
1671
d = dst;
1672
for (d = dst, u = 0; u < dst_len; u ++)
1673
sph_enc32le(d + (u << 2), sc->state[u]);
1674
}
1675
1676
static void
1677
finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1678
{
1679
sph_simd_big_context *sc;
1680
unsigned char *d;
1681
size_t u;
1682
1683
sc = cc;
1684
if (sc->ptr > 0 || n > 0) {
1685
memset(sc->buf + sc->ptr, 0,
1686
(sizeof sc->buf) - sc->ptr);
1687
sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1688
compress_big(sc, 0);
1689
}
1690
memset(sc->buf, 0, sizeof sc->buf);
1691
encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1692
compress_big(sc, 1);
1693
d = dst;
1694
for (d = dst, u = 0; u < dst_len; u ++)
1695
sph_enc32le(d + (u << 2), sc->state[u]);
1696
}
1697
1698
void
1699
sph_simd224_init(void *cc)
1700
{
1701
init_small(cc, IV224);
1702
}
1703
1704
void
1705
sph_simd224(void *cc, const void *data, size_t len)
1706
{
1707
update_small(cc, data, len);
1708
}
1709
1710
void
1711
sph_simd224_close(void *cc, void *dst)
1712
{
1713
sph_simd224_addbits_and_close(cc, 0, 0, dst);
1714
}
1715
1716
void
1717
sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1718
{
1719
finalize_small(cc, ub, n, dst, 7);
1720
sph_simd224_init(cc);
1721
}
1722
1723
void
1724
sph_simd256_init(void *cc)
1725
{
1726
init_small(cc, IV256);
1727
}
1728
1729
void
1730
sph_simd256(void *cc, const void *data, size_t len)
1731
{
1732
update_small(cc, data, len);
1733
}
1734
1735
void
1736
sph_simd256_close(void *cc, void *dst)
1737
{
1738
sph_simd256_addbits_and_close(cc, 0, 0, dst);
1739
}
1740
1741
void
1742
sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1743
{
1744
finalize_small(cc, ub, n, dst, 8);
1745
sph_simd256_init(cc);
1746
}
1747
1748
void
1749
sph_simd384_init(void *cc)
1750
{
1751
init_big(cc, IV384);
1752
}
1753
1754
void
1755
sph_simd384(void *cc, const void *data, size_t len)
1756
{
1757
update_big(cc, data, len);
1758
}
1759
1760
void
1761
sph_simd384_close(void *cc, void *dst)
1762
{
1763
sph_simd384_addbits_and_close(cc, 0, 0, dst);
1764
}
1765
1766
void
1767
sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1768
{
1769
finalize_big(cc, ub, n, dst, 12);
1770
sph_simd384_init(cc);
1771
}
1772
1773
void
1774
sph_simd512_init(void *cc)
1775
{
1776
init_big(cc, IV512);
1777
}
1778
1779
void
1780
sph_simd512(void *cc, const void *data, size_t len)
1781
{
1782
update_big(cc, data, len);
1783
}
1784
1785
void
1786
sph_simd512_close(void *cc, void *dst)
1787
{
1788
sph_simd512_addbits_and_close(cc, 0, 0, dst);
1789
}
1790
1791
void
1792
sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1793
{
1794
finalize_big(cc, ub, n, dst, 16);
1795
sph_simd512_init(cc);
1796
}
1797
#ifdef __cplusplus
1798
}
1799
#endif
1800