GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_chacha-sse2.h
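
/*
 * scrypt-jane ChunkMix (scrypt BlockMix) built on the ChaCha/8 core, SSE2 variants.
 * Three implementations are provided, selected by the build configuration:
 *   - 32-bit x86 inline assembly   (X86ASM_SSE2)
 *   - x86-64 inline assembly       (X86_64ASM_SSE2)
 *   - SSE2 compiler intrinsics     (X86_INTRINSIC_SSE2)
 * Each mixes the 2*r 64-byte blocks of Bin (optionally XORed with Bxor) into Bout,
 * interleaving even and odd output blocks as scrypt's BlockMix requires.
 */
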
/* x86 */
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
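    /* Stack arguments (after the four pushes): [ebp+20] = Bout -> edi, [ebp+24] = Bin -> esi,
       [ebp+28] = Bxor -> eax (may be NULL), [ebp+32] = r -> ebx. A 16-byte-aligned stack
       slot holds x0 across the rounds, since only xmm0..xmm7 exist in 32-bit mode. */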
    a1(push ebx)
    a1(push edi)
    a1(push esi)
    a1(push ebp)
    a2(mov ebp,esp)
    a2(mov edi,[ebp+20])
    a2(mov esi,[ebp+24])
    a2(mov eax,[ebp+28])
    a2(mov ebx,[ebp+32])
    a2(sub esp,16)
    a2(and esp,~15)
    a2(lea edx,[ebx*2])
    a2(shl edx,6)
    a2(lea ecx,[edx-64])
    a2(and eax, eax)
    a2(movdqa xmm0,[ecx+esi+0])
    a2(movdqa xmm1,[ecx+esi+16])
    a2(movdqa xmm2,[ecx+esi+32])
    a2(movdqa xmm3,[ecx+esi+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[ecx+eax+0])
    a2(pxor xmm1,[ecx+eax+16])
    a2(pxor xmm2,[ecx+eax+32])
    a2(pxor xmm3,[ecx+eax+48])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    a2(xor ecx,ecx)
    a2(xor ebx,ebx)
    a1(scrypt_ChunkMix_sse2_loop:)
    a2(and eax, eax)
    a2(pxor xmm0,[esi+ecx+0])
    a2(pxor xmm1,[esi+ecx+16])
    a2(pxor xmm2,[esi+ecx+32])
    a2(pxor xmm3,[esi+ecx+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[eax+ecx+0])
    a2(pxor xmm1,[eax+ecx+16])
    a2(pxor xmm2,[eax+ecx+32])
    a2(pxor xmm3,[eax+ecx+48])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    a2(movdqa [esp+0],xmm0)
    a2(movdqa xmm4,xmm1)
    a2(movdqa xmm5,xmm2)
    a2(movdqa xmm7,xmm3)
    a2(mov eax,8)
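    /* 8 rounds of ChaCha, two (a column round and a diagonal round) per pass.
       SSE2 has no vector rotate, so each rotation is pslld/psrld/pxor on a copy. */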
    a1(scrypt_chacha_sse2_loop: )
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,16)
    a2(psrld xmm6,16)
    a2(pxor xmm3,xmm6)
    a2(paddd xmm2,xmm3)
    a2(pxor xmm1,xmm2)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,12)
    a2(psrld xmm6,20)
    a2(pxor xmm1,xmm6)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,8)
    a2(psrld xmm6,24)
    a2(pxor xmm3,xmm6)
    a3(pshufd xmm0,xmm0,0x93)
    a2(paddd xmm2,xmm3)
    a3(pshufd xmm3,xmm3,0x4e)
    a2(pxor xmm1,xmm2)
    a3(pshufd xmm2,xmm2,0x39)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,7)
    a2(psrld xmm6,25)
    a2(pxor xmm1,xmm6)
    a2(sub eax,2)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,16)
    a2(psrld xmm6,16)
    a2(pxor xmm3,xmm6)
    a2(paddd xmm2,xmm3)
    a2(pxor xmm1,xmm2)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,12)
    a2(psrld xmm6,20)
    a2(pxor xmm1,xmm6)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,8)
    a2(psrld xmm6,24)
    a2(pxor xmm3,xmm6)
    a3(pshufd xmm0,xmm0,0x39)
    a2(paddd xmm2,xmm3)
    a3(pshufd xmm3,xmm3,0x4e)
    a2(pxor xmm1,xmm2)
    a3(pshufd xmm2,xmm2,0x93)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,7)
    a2(psrld xmm6,25)
    a2(pxor xmm1,xmm6)
    a1(ja scrypt_chacha_sse2_loop)
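    /* Feed the saved input block back in, then store mixed block i of the chunk:
       even-numbered blocks go to the first half of Bout, odd-numbered blocks to the
       second half (eax ends up as Bout + 64*((i/2) + (i odd ? r : 0))). */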
    a2(paddd xmm0,[esp+0])
    a2(paddd xmm1,xmm4)
    a2(paddd xmm2,xmm5)
    a2(paddd xmm3,xmm7)
    a2(lea eax,[ebx+ecx])
    a2(xor ebx,edx)
    a2(and eax,~0x7f)
    a2(add ecx,64)
    a2(shr eax,1)
    a2(add eax, edi)
    a2(cmp ecx,edx)
    a2(movdqa [eax+0],xmm0)
    a2(movdqa [eax+16],xmm1)
    a2(movdqa [eax+32],xmm2)
    a2(movdqa [eax+48],xmm3)
    a2(mov eax,[ebp+28])
    a1(jne scrypt_ChunkMix_sse2_loop)
    a2(mov esp,ebp)
    a1(pop ebp)
    a1(pop esi)
    a1(pop edi)
    a1(pop ebx)
    aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif



/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
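    /* Register arguments: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), rcx = r.
       xmm8..xmm11 hold the pre-round state, so no stack scratch is needed here. */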
    a2(lea rcx,[rcx*2])
    a2(shl rcx,6)
    a2(lea r9,[rcx-64])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_sse2_loop:)
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    a2(movdqa xmm8,xmm0)
    a2(movdqa xmm9,xmm1)
    a2(movdqa xmm10,xmm2)
    a2(movdqa xmm11,xmm3)
    a2(mov rax,8)
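    /* Same 8-round ChaCha schedule as the 32-bit version above; rax counts down by 2. */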
    a1(scrypt_chacha_sse2_loop: )
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,16)
    a2(psrld xmm6,16)
    a2(pxor xmm3,xmm6)
    a2(paddd xmm2,xmm3)
    a2(pxor xmm1,xmm2)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,12)
    a2(psrld xmm6,20)
    a2(pxor xmm1,xmm6)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,8)
    a2(psrld xmm6,24)
    a2(pxor xmm3,xmm6)
    a3(pshufd xmm0,xmm0,0x93)
    a2(paddd xmm2,xmm3)
    a3(pshufd xmm3,xmm3,0x4e)
    a2(pxor xmm1,xmm2)
    a3(pshufd xmm2,xmm2,0x39)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,7)
    a2(psrld xmm6,25)
    a2(pxor xmm1,xmm6)
    a2(sub rax,2)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,16)
    a2(psrld xmm6,16)
    a2(pxor xmm3,xmm6)
    a2(paddd xmm2,xmm3)
    a2(pxor xmm1,xmm2)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,12)
    a2(psrld xmm6,20)
    a2(pxor xmm1,xmm6)
    a2(paddd xmm0,xmm1)
    a2(pxor xmm3,xmm0)
    a2(movdqa xmm6,xmm3)
    a2(pslld xmm3,8)
    a2(psrld xmm6,24)
    a2(pxor xmm3,xmm6)
    a3(pshufd xmm0,xmm0,0x39)
    a2(paddd xmm2,xmm3)
    a3(pshufd xmm3,xmm3,0x4e)
    a2(pxor xmm1,xmm2)
    a3(pshufd xmm2,xmm2,0x93)
    a2(movdqa xmm6,xmm1)
    a2(pslld xmm1,7)
    a2(psrld xmm6,25)
    a2(pxor xmm1,xmm6)
    a1(ja scrypt_chacha_sse2_loop)
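    /* Feedforward and interleaved store, as in the 32-bit version: even blocks to the
       first half of Bout, odd blocks to the second half. */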
    a2(paddd xmm0,xmm8)
    a2(paddd xmm1,xmm9)
    a2(paddd xmm2,xmm10)
    a2(paddd xmm3,xmm11)
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0x7f)
    a2(add r9,64)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a1(jne scrypt_ChunkMix_sse2_loop)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSE2

static void NOINLINE
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
    }

    /* 2: for i = 0 to 2r - 1 do */
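    /* half toggles between 0 and r each pass, so block i is written to
       Bout index (i / 2) + (i odd ? r : 0). */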
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

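        /* ChaCha/8 core: two rounds per pass. Each rotate is emulated with a pair of
           shifts and an OR; the pshufd constants 0x93, 0x4e and 0x39 rotate the word
           lanes to switch between column rounds and diagonal rounds. */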
        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

/*
 * Special version with r = 1 and no XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
    const uint32_t r = 1;
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

/*
 * Special version with r = 1 and unconditional XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
    const uint32_t r = 1;
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
    x0 = _mm_xor_si128(x0, xmmp[0]);
    x1 = _mm_xor_si128(x1, xmmp[1]);
    x2 = _mm_xor_si128(x2, xmmp[2]);
    x3 = _mm_xor_si128(x3, xmmp[3]);

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        xmmp = (xmmi *)scrypt_block(Bxor, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 16);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x4 = x3;
            x3 = _mm_slli_epi32(x3, 8);
            x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x4 = x1;
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

#endif

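/* If any of the SSE2 variants above was compiled in, record it as the active mix and
   set SCRYPT_CHACHA_INCLUDED, which the #if guards at the top of this file check. */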
#if defined(SCRYPT_CHACHA_SSE2)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "ChaCha/8-SSE2"
    #undef SCRYPT_CHACHA_INCLUDED
    #define SCRYPT_CHACHA_INCLUDED
#endif