GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_chacha-avx.h
/* x86 */
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	/* load arguments: edi = Bout, esi = Bin, eax = Bxor, ebx = r */
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	/* 64-byte aligned scratch block on the stack */
	a2(sub esp,64)
	a2(and esp,~63)
	/* edx = chunk size in bytes (r * 2 * 64), ecx = offset of the last block */
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	/* ZF = (Bxor == NULL) */
	a2(and eax, eax)
	/* build the pshufb masks: xmm4 rotates each 32-bit lane left by 16, xmm5 by 8 */
	a2(mov ebx, 0x01000302)
	a2(vmovd xmm4, ebx)
	a2(mov ebx, 0x05040706)
	a2(vmovd xmm0, ebx)
	a2(mov ebx, 0x09080b0a)
	a2(vmovd xmm1, ebx)
	a2(mov ebx, 0x0d0c0f0e)
	a2(vmovd xmm2, ebx)
	a2(mov ebx, 0x02010003)
	a2(vmovd xmm5, ebx)
	a2(mov ebx, 0x06050407)
	a2(vmovd xmm3, ebx)
	a2(mov ebx, 0x0a09080b)
	a2(vmovd xmm6, ebx)
	a2(mov ebx, 0x0e0d0c0f)
	a2(vmovd xmm7, ebx)
	a3(vpunpckldq xmm4, xmm4, xmm0)
	a3(vpunpckldq xmm5, xmm5, xmm3)
	a3(vpunpckldq xmm1, xmm1, xmm2)
	a3(vpunpckldq xmm6, xmm6, xmm7)
	a3(vpunpcklqdq xmm4, xmm4, xmm1)
	a3(vpunpcklqdq xmm5, xmm5, xmm6)
	/* X = last block of Bin, optionally XORed with the last block of Bxor */
	a2(vmovdqa xmm0,[ecx+esi+0])
	a2(vmovdqa xmm1,[ecx+esi+16])
	a2(vmovdqa xmm2,[ecx+esi+32])
	a2(vmovdqa xmm3,[ecx+esi+48])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[ecx+eax+0])
	a3(vpxor xmm1,xmm1,[ecx+eax+16])
	a3(vpxor xmm2,xmm2,[ecx+eax+32])
	a3(vpxor xmm3,xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	/* for each 64-byte block: X = ChaCha/8(X ^ Bin_i [^ Bxor_i]) */
	a1(scrypt_ChunkMix_avx_loop:)
	a2(and eax, eax)
	a3(vpxor xmm0,xmm0,[esi+ecx+0])
	a3(vpxor xmm1,xmm1,[esi+ecx+16])
	a3(vpxor xmm2,xmm2,[esi+ecx+32])
	a3(vpxor xmm3,xmm3,[esi+ecx+48])
	a1(jz scrypt_ChunkMix_avx_no_xor2)
	a3(vpxor xmm0,xmm0,[eax+ecx+0])
	a3(vpxor xmm1,xmm1,[eax+ecx+16])
	a3(vpxor xmm2,xmm2,[eax+ecx+32])
	a3(vpxor xmm3,xmm3,[eax+ecx+48])
	a1(scrypt_ChunkMix_avx_no_xor2:)
	/* save the block input for the feed-forward addition after the rounds */
	a2(vmovdqa [esp+0],xmm0)
	a2(vmovdqa [esp+16],xmm1)
	a2(vmovdqa [esp+32],xmm2)
	a2(vmovdqa [esp+48],xmm3)
	/* 8 ChaCha rounds; the loop body below does two rounds per iteration */
	a2(mov eax,8)
	a1(scrypt_chacha_avx_loop: )
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm6,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm6)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x93)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x39)
	a3(vpsrld xmm6,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm6)
	a2(sub eax,2)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm6,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm6)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x39)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x93)
	a3(vpsrld xmm6,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm6)
	a1(ja scrypt_chacha_avx_loop)
	/* feed-forward: add the saved block input back in */
	a3(vpaddd xmm0,xmm0,[esp+0])
	a3(vpaddd xmm1,xmm1,[esp+16])
	a3(vpaddd xmm2,xmm2,[esp+32])
	a3(vpaddd xmm3,xmm3,[esp+48])
	/* interleave the output: even blocks go to the first half of Bout, odd blocks to the second half */
	a2(lea eax,[ebx+ecx])
	a2(xor ebx,edx)
	a2(and eax,~0x7f)
	a2(add ecx,64)
	a2(shr eax,1)
	a2(add eax, edi)
	a2(cmp ecx,edx)
	a2(vmovdqa [eax+0],xmm0)
	a2(vmovdqa [eax+16],xmm1)
	a2(vmovdqa [eax+32],xmm2)
	a2(vmovdqa [eax+48],xmm3)
	a2(mov eax,[ebp+28])
	a1(jne scrypt_ChunkMix_avx_loop)
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif

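/*
 * Editor's note (illustrative, not part of the original source): the
 * constants combined into xmm4/xmm5 above are pshufb control masks that
 * rotate every 32-bit lane left by 16 and by 8 bits as a byte shuffle,
 * which is how ChaCha's 16- and 8-bit rotations are done without a
 * shift/or pair. The sketch below shows the same masks with intrinsics;
 * the guard macro and function names are hypothetical and never defined
 * by this project, and <immintrin.h> is assumed to be available.
 */
#if defined(SCRYPT_CHACHA_AVX_ROTL_EXAMPLE)
#include <immintrin.h>

/* rotate each 32-bit lane left by 16: bytes within a lane become 2,3,0,1 */
static __m128i example_rotl16_32(__m128i x) {
	const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
	return _mm_shuffle_epi8(x, mask);
}

/* rotate each 32-bit lane left by 8: bytes within a lane become 3,0,1,2 */
static __m128i example_rotl8_32(__m128i x) {
	const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
	return _mm_shuffle_epi8(x, mask);
}
#endif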


/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* registers as used below: rdi = Bout, rsi = Bin, rdx = Bxor, rcx = r */
	/* rcx = chunk size in bytes (r * 2 * 64), r9 = offset of the last block */
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* ZF = (Bxor == NULL) */
	a2(and rdx, rdx)
	/* X = last block of Bin, optionally XORed with the last block of Bxor */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	/* pshufb masks: xmm4 rotates each 32-bit lane left by 16, xmm5 by 8 */
	a2(mov r8, 0x0504070601000302)
	a2(mov rax, 0x0d0c0f0e09080b0a)
	a2(movq xmm4, r8)
	a2(movq xmm6, rax)
	a2(mov r8, 0x0605040702010003)
	a2(mov rax, 0x0e0d0c0f0a09080b)
	a2(movq xmm5, r8)
	a2(movq xmm7, rax)
	a3(vpunpcklqdq xmm4, xmm4, xmm6)
	a3(vpunpcklqdq xmm5, xmm5, xmm7)
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r8,r8)
	a2(xor r9,r9)
	/* for each 64-byte block: X = ChaCha/8(X ^ Bin_i [^ Bxor_i]) */
	a1(scrypt_ChunkMix_avx_loop:)
	a2(and rdx, rdx)
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	a1(jz scrypt_ChunkMix_avx_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a1(scrypt_ChunkMix_avx_no_xor2:)
	/* save the block input in xmm8-xmm11 for the feed-forward addition */
	a2(vmovdqa xmm8,xmm0)
	a2(vmovdqa xmm9,xmm1)
	a2(vmovdqa xmm10,xmm2)
	a2(vmovdqa xmm11,xmm3)
	/* 8 ChaCha rounds; the loop body below does two rounds per iteration */
	a2(mov rax,8)
	a1(scrypt_chacha_avx_loop: )
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm12,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm12)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x93)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x39)
	a3(vpsrld xmm12,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm12)
	a2(sub rax,2)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm12,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm12)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x39)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x93)
	a3(vpsrld xmm12,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm12)
	a1(ja scrypt_chacha_avx_loop)
	a3(vpaddd xmm0,xmm0,xmm8)
	a3(vpaddd xmm1,xmm1,xmm9)
	a3(vpaddd xmm2,xmm2,xmm10)
	a3(vpaddd xmm3,xmm3,xmm11)
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0x7f)
	a2(add r9,64)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	a1(jne scrypt_ChunkMix_avx_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

static void NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

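/*
 * Editor's note (illustrative, not part of the original source): ChunkMix
 * is scrypt's BlockMix step. A ROMix-style caller fills a lookup table V
 * and then revisits it, passing Bxor = NULL on the first pass and V_j on
 * the second. Everything named example_* below, and the simplified buffer
 * handling, is hypothetical; only scrypt_ChunkMix_avx and the chunk layout
 * (2*r blocks of 64 bytes) come from this file. The guard macro is never
 * defined by the build, and N must be a power of two.
 */
#if defined(SCRYPT_CHACHA_AVX_ROMIX_EXAMPLE)
#include <string.h>

static void
example_romix(uint32_t *X, uint32_t *Y, uint32_t *V, uint32_t N, uint32_t r) {
	uint32_t i, j, chunkWords = r * 2 * 16;   /* one chunk = 2*r blocks of 16 words */

	/* 1st loop: V_i = X; X = BlockMix(X) */
	for (i = 0; i < N; i++) {
		memcpy(V + i * chunkWords, X, chunkWords * sizeof(uint32_t));
		scrypt_ChunkMix_avx(Y, X, NULL, r);
		memcpy(X, Y, chunkWords * sizeof(uint32_t));
	}

	/* 2nd loop: j = Integerify(X) mod N; X = BlockMix(X ^ V_j) */
	for (i = 0; i < N; i++) {
		j = X[chunkWords - 16] & (N - 1);     /* first word of the last 64-byte block */
		scrypt_ChunkMix_avx(Y, X, V + j * chunkWords, r);
		memcpy(X, Y, chunkWords * sizeof(uint32_t));
	}
}
#endif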

/*
 * Special version with r = 1 and no XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

/*
 * Special version with r = 1 and unconditional XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
	x0 = _mm_xor_si128(x0, xmmp[0]);
	x1 = _mm_xor_si128(x1, xmmp[1]);
	x2 = _mm_xor_si128(x2, xmmp[2]);
	x3 = _mm_xor_si128(x3, xmmp[3]);

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		xmmp = (xmmi *)scrypt_block(Bxor, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

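/*
 * Editor's note (illustrative, not part of the original source): the two
 * r = 1 specializations above map onto the two halves of ROMix. The first
 * half never XORs a second chunk, so it can call scrypt_ChunkMix_avx_1;
 * the second half always XORs V_j, so it can call scrypt_ChunkMix_avx_1_xor.
 * example_romix_1 and its buffer handling are hypothetical, the guard macro
 * is never defined by the build, and N must be a power of two.
 */
#if defined(SCRYPT_CHACHA_AVX_R1_EXAMPLE)
#include <string.h>

static void
example_romix_1(uint32_t *X /*[32]*/, uint32_t *Y /*[32]*/, uint32_t *V /*[N*32]*/, uint32_t N) {
	uint32_t i, j;

	/* 1st loop: V_i = X; X = BlockMix(X); no second input, so no XOR */
	for (i = 0; i < N; i++) {
		memcpy(V + i * 32, X, 32 * sizeof(uint32_t));
		scrypt_ChunkMix_avx_1(Y, X);
		memcpy(X, Y, 32 * sizeof(uint32_t));
	}

	/* 2nd loop: X = BlockMix(X ^ V_j), with j = Integerify(X) mod N */
	for (i = 0; i < N; i++) {
		j = X[16] & (N - 1);               /* first word of the last block */
		scrypt_ChunkMix_avx_1_xor(Y, X, V + j * 32);
		memcpy(X, Y, 32 * sizeof(uint32_t));
	}
}
#endif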

#endif

#if defined(SCRYPT_CHACHA_AVX)
	#undef SCRYPT_MIX
	#define SCRYPT_MIX "ChaCha/8-AVX"
	#undef SCRYPT_CHACHA_INCLUDED
	#define SCRYPT_CHACHA_INCLUDED
#endif