GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_chacha-ssse3.h
/* x86 */
#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSSE3

asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
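	/* cdecl arguments, read off the stack after the four pushes below:
	   edi = Bout, esi = Bin, eax = Bxor (may be NULL), ebx = r.
	   esp is then lowered and rounded down to a 64-byte boundary to form an
	   aligned scratch area for the ChaCha feed-forward copy. */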
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	a2(sub esp,64)
	a2(and esp,~63)
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	a2(and eax, eax)
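	/* Build the pshufb masks: xmm4 rotates every 32-bit lane left by 16 bits,
	   xmm5 by 8 bits (the same constants the intrinsic path loads from
	   ssse3_rotl16_32bit / ssse3_rotl8_32bit). None of the instructions up to
	   the jz touch EFLAGS, so the Bxor test above still decides that branch. */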
	a2(mov ebx, 0x01000302)
	a2(movd xmm4, ebx)
	a2(mov ebx, 0x05040706)
	a2(movd xmm0, ebx)
	a2(mov ebx, 0x09080b0a)
	a2(movd xmm1, ebx)
	a2(mov ebx, 0x0d0c0f0e)
	a2(movd xmm2, ebx)
	a2(mov ebx, 0x02010003)
	a2(movd xmm5, ebx)
	a2(mov ebx, 0x06050407)
	a2(movd xmm3, ebx)
	a2(mov ebx, 0x0a09080b)
	a2(movd xmm6, ebx)
	a2(mov ebx, 0x0e0d0c0f)
	a2(movd xmm7, ebx)
	a2(punpckldq xmm4, xmm0)
	a2(punpckldq xmm5, xmm3)
	a2(punpckldq xmm1, xmm2)
	a2(punpckldq xmm6, xmm7)
	a2(punpcklqdq xmm4, xmm1)
	a2(punpcklqdq xmm5, xmm6)
	a2(movdqa xmm0,[ecx+esi+0])
	a2(movdqa xmm1,[ecx+esi+16])
	a2(movdqa xmm2,[ecx+esi+32])
	a2(movdqa xmm3,[ecx+esi+48])
	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[ecx+eax+0])
	a2(pxor xmm1,[ecx+eax+16])
	a2(pxor xmm2,[ecx+eax+32])
	a2(pxor xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	a1(scrypt_ChunkMix_ssse3_loop:)
	a2(and eax, eax)
	a2(pxor xmm0,[esi+ecx+0])
	a2(pxor xmm1,[esi+ecx+16])
	a2(pxor xmm2,[esi+ecx+32])
	a2(pxor xmm3,[esi+ecx+48])
	a1(jz scrypt_ChunkMix_ssse3_no_xor2)
	a2(pxor xmm0,[eax+ecx+0])
	a2(pxor xmm1,[eax+ecx+16])
	a2(pxor xmm2,[eax+ecx+32])
	a2(pxor xmm3,[eax+ecx+48])
	a1(scrypt_ChunkMix_ssse3_no_xor2:)
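	/* Keep a copy of the working block for the feed-forward: rows 0-2 go to
	   the aligned stack scratch, row 3 stays in xmm7. The loop below then
	   performs ChaCha/8: eax counts 8 down by 2, one column round plus one
	   diagonal round per pass. */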
	a2(movdqa [esp+0],xmm0)
	a2(movdqa [esp+16],xmm1)
	a2(movdqa [esp+32],xmm2)
	a2(movdqa xmm7,xmm3)
	a2(mov eax,8)
	a1(scrypt_chacha_ssse3_loop: )
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm4)
	a2(paddd xmm2,xmm3)
	a2(pxor xmm1,xmm2)
	a2(movdqa xmm6,xmm1)
	a2(pslld xmm1,12)
	a2(psrld xmm6,20)
	a2(pxor xmm1,xmm6)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm5)
	a3(pshufd xmm0,xmm0,0x93)
	a2(paddd xmm2,xmm3)
	a3(pshufd xmm3,xmm3,0x4e)
	a2(pxor xmm1,xmm2)
	a3(pshufd xmm2,xmm2,0x39)
	a2(movdqa xmm6,xmm1)
	a2(pslld xmm1,7)
	a2(psrld xmm6,25)
	a2(pxor xmm1,xmm6)
	a2(sub eax,2)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm4)
	a2(paddd xmm2,xmm3)
	a2(pxor xmm1,xmm2)
	a2(movdqa xmm6,xmm1)
	a2(pslld xmm1,12)
	a2(psrld xmm6,20)
	a2(pxor xmm1,xmm6)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm5)
	a3(pshufd xmm0,xmm0,0x39)
	a2(paddd xmm2,xmm3)
	a3(pshufd xmm3,xmm3,0x4e)
	a2(pxor xmm1,xmm2)
	a3(pshufd xmm2,xmm2,0x93)
	a2(movdqa xmm6,xmm1)
	a2(pslld xmm1,7)
	a2(psrld xmm6,25)
	a2(pxor xmm1,xmm6)
	a1(ja scrypt_chacha_ssse3_loop)
	a2(paddd xmm0,[esp+0])
	a2(paddd xmm1,[esp+16])
	a2(paddd xmm2,[esp+32])
	a2(paddd xmm3,xmm7)
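	/* Compute the output address: ebx toggles between 0 and the chunk size,
	   so even-indexed blocks are written to Bout[0..r-1] and odd-indexed
	   blocks to Bout[r..2r-1] (the and ~0x7f / shr 1 pair folds block i into
	   slot i/2 of the selected half). */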
	a2(lea eax,[ebx+ecx])
	a2(xor ebx,edx)
	a2(and eax,~0x7f)
	a2(add ecx,64)
	a2(shr eax,1)
	a2(add eax, edi)
	a2(cmp ecx,edx)
	a2(movdqa [eax+0],xmm0)
	a2(movdqa [eax+16],xmm1)
	a2(movdqa [eax+32],xmm2)
	a2(movdqa [eax+48],xmm3)
	a2(mov eax,[ebp+28])
	a1(jne scrypt_ChunkMix_ssse3_loop)
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif


/* x64 */
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSSE3

asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
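	/* The register usage below matches the SysV AMD64 calling convention:
	   rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), rcx = r. rcx is turned
	   into the chunk size in bytes (r * 2 * 64) and r9 into the byte offset
	   of the last 64-byte block. */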
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	a2(and rdx, rdx)
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
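	/* Same rotl16/rotl8 pshufb masks as the x86 path, loaded as 64-bit
	   immediates and merged into xmm4/xmm5 with punpcklqdq; the Bxor test
	   (and rdx, rdx) above is still live for the jz that follows. */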
	a2(mov r8, 0x0504070601000302)
	a2(mov rax, 0x0d0c0f0e09080b0a)
	a2(movq xmm4, r8)
	a2(movq xmm6, rax)
	a2(mov r8, 0x0605040702010003)
	a2(mov rax, 0x0e0d0c0f0a09080b)
	a2(movq xmm5, r8)
	a2(movq xmm7, rax)
	a2(punpcklqdq xmm4, xmm6)
	a2(punpcklqdq xmm5, xmm7)
	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor r8,r8)
	a2(xor r9,r9)
	a1(scrypt_ChunkMix_ssse3_loop:)
	a2(and rdx, rdx)
	a2(pxor xmm0,[rsi+r9+0])
	a2(pxor xmm1,[rsi+r9+16])
	a2(pxor xmm2,[rsi+r9+32])
	a2(pxor xmm3,[rsi+r9+48])
	a1(jz scrypt_ChunkMix_ssse3_no_xor2)
	a2(pxor xmm0,[rdx+r9+0])
	a2(pxor xmm1,[rdx+r9+16])
	a2(pxor xmm2,[rdx+r9+32])
	a2(pxor xmm3,[rdx+r9+48])
	a1(scrypt_ChunkMix_ssse3_no_xor2:)
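	/* Save the working block in xmm8-xmm11 for the feed-forward (no stack
	   scratch is needed here), then run the same ChaCha/8 double-round loop
	   as the x86 path. */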
	a2(movdqa xmm8,xmm0)
	a2(movdqa xmm9,xmm1)
	a2(movdqa xmm10,xmm2)
	a2(movdqa xmm11,xmm3)
	a2(mov rax,8)
	a1(scrypt_chacha_ssse3_loop: )
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm4)
	a2(paddd xmm2,xmm3)
	a2(pxor xmm1,xmm2)
	a2(movdqa xmm12,xmm1)
	a2(pslld xmm1,12)
	a2(psrld xmm12,20)
	a2(pxor xmm1,xmm12)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm5)
	a3(pshufd xmm0,xmm0,0x93)
	a2(paddd xmm2,xmm3)
	a3(pshufd xmm3,xmm3,0x4e)
	a2(pxor xmm1,xmm2)
	a3(pshufd xmm2,xmm2,0x39)
	a2(movdqa xmm12,xmm1)
	a2(pslld xmm1,7)
	a2(psrld xmm12,25)
	a2(pxor xmm1,xmm12)
	a2(sub rax,2)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm4)
	a2(paddd xmm2,xmm3)
	a2(pxor xmm1,xmm2)
	a2(movdqa xmm12,xmm1)
	a2(pslld xmm1,12)
	a2(psrld xmm12,20)
	a2(pxor xmm1,xmm12)
	a2(paddd xmm0,xmm1)
	a2(pxor xmm3,xmm0)
	a2(pshufb xmm3,xmm5)
	a3(pshufd xmm0,xmm0,0x39)
	a2(paddd xmm2,xmm3)
	a3(pshufd xmm3,xmm3,0x4e)
	a2(pxor xmm1,xmm2)
	a3(pshufd xmm2,xmm2,0x93)
	a2(movdqa xmm12,xmm1)
	a2(pslld xmm1,7)
	a2(psrld xmm12,25)
	a2(pxor xmm1,xmm12)
	a1(ja scrypt_chacha_ssse3_loop)
	a2(paddd xmm0,xmm8)
	a2(paddd xmm1,xmm9)
	a2(paddd xmm2,xmm10)
	a2(paddd xmm3,xmm11)
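	/* Output addressing mirrors the x86 path: r8 toggles between 0 and the
	   chunk size, so even-indexed blocks land in the first half of Bout and
	   odd-indexed blocks in the second half. */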
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0x7f)
	a2(add r9,64)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(movdqa [rax+0],xmm0)
	a2(movdqa [rax+16],xmm1)
	a2(movdqa [rax+32],xmm2)
	a2(movdqa [rax+48],xmm3)
	a1(jne scrypt_ChunkMix_ssse3_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_SSSE3

static void NOINLINE
scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

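		/* 8 ChaCha rounds, two per iteration: 16- and 8-bit rotates via
		   _mm_shuffle_epi8 with x4/x5, 12- and 7-bit rotates via a shift pair,
		   and _mm_shuffle_epi32 moves the rows into and out of diagonal
		   position for the second half of each pass. */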
		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

/*
 * Special version with r = 1 and no XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

/*
 * Special version with r = 1 and unconditional XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
	x0 = _mm_xor_si128(x0, xmmp[0]);
	x1 = _mm_xor_si128(x1, xmmp[1]);
	x2 = _mm_xor_si128(x2, xmmp[2]);
	x3 = _mm_xor_si128(x3, xmmp[3]);

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		xmmp = (xmmi *)scrypt_block(Bxor, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = x1;
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

#endif

#if defined(SCRYPT_CHACHA_SSSE3)
	#undef SCRYPT_MIX
	#define SCRYPT_MIX "ChaCha/8-SSSE3"
	#undef SCRYPT_CHACHA_INCLUDED
	#define SCRYPT_CHACHA_INCLUDED
#endif