GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_salsa-sse2.h
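/*
 * scrypt-jane: SSE2 implementations of scrypt_ChunkMix for the Salsa20/8 mix,
 * in three flavors: x86 assembly, x86-64 assembly, and compiler intrinsics.
 * Each variant walks a chunk of 2*r 64-byte blocks, optionally xors in a
 * second chunk, applies eight Salsa20 rounds per block, and writes the
 * even-indexed results to Bout[0..r-1] and the odd-indexed results to
 * Bout[r..2r-1]. The a1/a2/a3 and asm_naked_fn* wrappers are not defined here;
 * in scrypt-jane they normally come from the portable x86 header and emit
 * one-, two-, and three-operand assembly statements.
 */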
/* x86 */
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
    /* save callee-saved registers and load the stack arguments:
       edi = Bout, esi = Bin, eax = Bxor, ebx = r */
    a1(push ebx)
    a1(push edi)
    a1(push esi)
    a1(push ebp)
    a2(mov ebp,esp)
    a2(mov edi,[ebp+20])
    a2(mov esi,[ebp+24])
    a2(mov eax,[ebp+28])
    a2(mov ebx,[ebp+32])
    /* reserve 32 bytes of 64-byte aligned scratch (only xmm0..xmm7 are available on x86) */
    a2(sub esp,32)
    a2(and esp,~63)
    /* edx = chunk size in bytes (r * 2 * 64), ecx = offset of the last block */
    a2(lea edx,[ebx*2])
    a2(shl edx,6)
    a2(lea ecx,[edx-64])
    /* X = last block of Bin, xored with the last block of Bxor when Bxor != NULL */
    a2(and eax, eax)
    a2(movdqa xmm0,[ecx+esi+0])
    a2(movdqa xmm1,[ecx+esi+16])
    a2(movdqa xmm2,[ecx+esi+32])
    a2(movdqa xmm3,[ecx+esi+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[ecx+eax+0])
    a2(pxor xmm1,[ecx+eax+16])
    a2(pxor xmm2,[ecx+eax+32])
    a2(pxor xmm3,[ecx+eax+48])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    /* ecx = byte offset of block i; ebx toggles between 0 and the chunk size
       to interleave even/odd output blocks */
    a2(xor ecx,ecx)
    a2(xor ebx,ebx)
    a1(scrypt_ChunkMix_sse2_loop:)
    /* X ^= Bin[i] (and Bxor[i] when present) */
    a2(and eax, eax)
    a2(pxor xmm0,[esi+ecx+0])
    a2(pxor xmm1,[esi+ecx+16])
    a2(pxor xmm2,[esi+ecx+32])
    a2(pxor xmm3,[esi+ecx+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[eax+ecx+0])
    a2(pxor xmm1,[eax+ecx+16])
    a2(pxor xmm2,[eax+ecx+32])
    a2(pxor xmm3,[eax+ecx+48])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    /* keep a copy of X for the feed-forward: rows 0-1 on the stack, rows 2-3 in xmm6/xmm7 */
    a2(movdqa [esp+0],xmm0)
    a2(movdqa [esp+16],xmm1)
    a2(movdqa xmm6,xmm2)
    a2(movdqa xmm7,xmm3)
    /* eight Salsa20 rounds, two per pass */
    a2(mov eax,8)
    a1(scrypt_salsa_sse2_loop: )
    a2(movdqa xmm4, xmm1)
    a2(paddd xmm4, xmm0)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 7)
    a2(psrld xmm5, 25)
    a2(pxor xmm3, xmm4)
    a2(movdqa xmm4, xmm0)
    a2(pxor xmm3, xmm5)
    a2(paddd xmm4, xmm3)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 9)
    a2(psrld xmm5, 23)
    a2(pxor xmm2, xmm4)
    a2(movdqa xmm4, xmm3)
    a2(pxor xmm2, xmm5)
    a3(pshufd xmm3, xmm3, 0x93)
    a2(paddd xmm4, xmm2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 13)
    a2(psrld xmm5, 19)
    a2(pxor xmm1, xmm4)
    a2(movdqa xmm4, xmm2)
    a2(pxor xmm1, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a2(paddd xmm4, xmm1)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 18)
    a2(psrld xmm5, 14)
    a2(pxor xmm0, xmm4)
    a2(movdqa xmm4, xmm3)
    a2(pxor xmm0, xmm5)
    a3(pshufd xmm1, xmm1, 0x39)
    a2(paddd xmm4, xmm0)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 7)
    a2(psrld xmm5, 25)
    a2(pxor xmm1, xmm4)
    a2(movdqa xmm4, xmm0)
    a2(pxor xmm1, xmm5)
    a2(paddd xmm4, xmm1)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 9)
    a2(psrld xmm5, 23)
    a2(pxor xmm2, xmm4)
    a2(movdqa xmm4, xmm1)
    a2(pxor xmm2, xmm5)
    a3(pshufd xmm1, xmm1, 0x93)
    a2(paddd xmm4, xmm2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 13)
    a2(psrld xmm5, 19)
    a2(pxor xmm3, xmm4)
    a2(movdqa xmm4, xmm2)
    a2(pxor xmm3, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a2(paddd xmm4, xmm3)
    a2(sub eax, 2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 18)
    a2(psrld xmm5, 14)
    a2(pxor xmm0, xmm4)
    a3(pshufd xmm3, xmm3, 0x39)
    a2(pxor xmm0, xmm5)
    a1(ja scrypt_salsa_sse2_loop)
    /* feed-forward: X += saved input */
    a2(paddd xmm0,[esp+0])
    a2(paddd xmm1,[esp+16])
    a2(paddd xmm2,xmm6)
    a2(paddd xmm3,xmm7)
    /* advance to the next block and store X to Bout block i/2 (even i) or r + i/2 (odd i) */
    a2(lea eax,[ebx+ecx])
    a2(xor ebx,edx)
    a2(and eax,~0x7f)
    a2(add ecx,64)
    a2(shr eax,1)
    a2(add eax, edi)
    a2(cmp ecx,edx)
    a2(movdqa [eax+0],xmm0)
    a2(movdqa [eax+16],xmm1)
    a2(movdqa [eax+32],xmm2)
    a2(movdqa [eax+48],xmm3)
    /* reload Bxor and continue until the whole chunk is processed */
    a2(mov eax,[ebp+28])
    a1(jne scrypt_ChunkMix_sse2_loop)
    /* restore registers and return, popping the 16 bytes of arguments */
    a2(mov esp,ebp)
    a1(pop ebp)
    a1(pop esi)
    a1(pop edi)
    a1(pop ebx)
    aret(16)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif

/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
    /* arguments: rdi = Bout, rsi = Bin, rdx = Bxor, rcx = r */
    /* rcx becomes the chunk size in bytes (r * 2 * 64); rax/r9 point at the last block of Bin/Bxor */
    a2(lea rcx,[rcx*2])
    a2(shl rcx,6)
    a2(lea r9,[rcx-64])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    /* X = last block of Bin, xored with the last block of Bxor when Bxor != NULL */
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    /* r9 = byte offset of block i; r8 toggles between 0 and the chunk size
       to interleave even/odd output blocks */
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_sse2_loop:)
    /* X ^= Bin[i] (and Bxor[i] when present) */
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a1(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    /* keep a copy of X for the feed-forward in xmm8..xmm11 */
    a2(movdqa xmm8,xmm0)
    a2(movdqa xmm9,xmm1)
    a2(movdqa xmm10,xmm2)
    a2(movdqa xmm11,xmm3)
    /* eight Salsa20 rounds, two per pass */
    a2(mov rax,8)
    a1(scrypt_salsa_sse2_loop: )
    a2(movdqa xmm4, xmm1)
    a2(paddd xmm4, xmm0)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 7)
    a2(psrld xmm5, 25)
    a2(pxor xmm3, xmm4)
    a2(movdqa xmm4, xmm0)
    a2(pxor xmm3, xmm5)
    a2(paddd xmm4, xmm3)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 9)
    a2(psrld xmm5, 23)
    a2(pxor xmm2, xmm4)
    a2(movdqa xmm4, xmm3)
    a2(pxor xmm2, xmm5)
    a3(pshufd xmm3, xmm3, 0x93)
    a2(paddd xmm4, xmm2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 13)
    a2(psrld xmm5, 19)
    a2(pxor xmm1, xmm4)
    a2(movdqa xmm4, xmm2)
    a2(pxor xmm1, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a2(paddd xmm4, xmm1)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 18)
    a2(psrld xmm5, 14)
    a2(pxor xmm0, xmm4)
    a2(movdqa xmm4, xmm3)
    a2(pxor xmm0, xmm5)
    a3(pshufd xmm1, xmm1, 0x39)
    a2(paddd xmm4, xmm0)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 7)
    a2(psrld xmm5, 25)
    a2(pxor xmm1, xmm4)
    a2(movdqa xmm4, xmm0)
    a2(pxor xmm1, xmm5)
    a2(paddd xmm4, xmm1)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 9)
    a2(psrld xmm5, 23)
    a2(pxor xmm2, xmm4)
    a2(movdqa xmm4, xmm1)
    a2(pxor xmm2, xmm5)
    a3(pshufd xmm1, xmm1, 0x93)
    a2(paddd xmm4, xmm2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 13)
    a2(psrld xmm5, 19)
    a2(pxor xmm3, xmm4)
    a2(movdqa xmm4, xmm2)
    a2(pxor xmm3, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a2(paddd xmm4, xmm3)
    a2(sub rax, 2)
    a2(movdqa xmm5, xmm4)
    a2(pslld xmm4, 18)
    a2(psrld xmm5, 14)
    a2(pxor xmm0, xmm4)
    a3(pshufd xmm3, xmm3, 0x39)
    a2(pxor xmm0, xmm5)
    a1(ja scrypt_salsa_sse2_loop)
    /* feed-forward: X += saved input */
    a2(paddd xmm0,xmm8)
    a2(paddd xmm1,xmm9)
    a2(paddd xmm2,xmm10)
    a2(paddd xmm3,xmm11)
    /* advance to the next block and store X to Bout block i/2 (even i) or r + i/2 (odd i) */
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0x7f)
    a2(add r9,64)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a1(jne scrypt_ChunkMix_sse2_loop)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif

/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_SSE2

static void NOINLINE
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x4 = x1;
            x4 = _mm_add_epi32(x4, x0);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 7);
            x5 = _mm_srli_epi32(x5, 25);
            x3 = _mm_xor_si128(x3, x4);
            x4 = x0;
            x3 = _mm_xor_si128(x3, x5);
            x4 = _mm_add_epi32(x4, x3);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 9);
            x5 = _mm_srli_epi32(x5, 23);
            x2 = _mm_xor_si128(x2, x4);
            x4 = x3;
            x2 = _mm_xor_si128(x2, x5);
            x3 = _mm_shuffle_epi32(x3, 0x93);
            x4 = _mm_add_epi32(x4, x2);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 13);
            x5 = _mm_srli_epi32(x5, 19);
            x1 = _mm_xor_si128(x1, x4);
            x4 = x2;
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_shuffle_epi32(x2, 0x4e);
            x4 = _mm_add_epi32(x4, x1);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 18);
            x5 = _mm_srli_epi32(x5, 14);
            x0 = _mm_xor_si128(x0, x4);
            x4 = x3;
            x0 = _mm_xor_si128(x0, x5);
            x1 = _mm_shuffle_epi32(x1, 0x39);
            x4 = _mm_add_epi32(x4, x0);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 7);
            x5 = _mm_srli_epi32(x5, 25);
            x1 = _mm_xor_si128(x1, x4);
            x4 = x0;
            x1 = _mm_xor_si128(x1, x5);
            x4 = _mm_add_epi32(x4, x1);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 9);
            x5 = _mm_srli_epi32(x5, 23);
            x2 = _mm_xor_si128(x2, x4);
            x4 = x1;
            x2 = _mm_xor_si128(x2, x5);
            x1 = _mm_shuffle_epi32(x1, 0x93);
            x4 = _mm_add_epi32(x4, x2);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 13);
            x5 = _mm_srli_epi32(x5, 19);
            x3 = _mm_xor_si128(x3, x4);
            x4 = x2;
            x3 = _mm_xor_si128(x3, x5);
            x2 = _mm_shuffle_epi32(x2, 0x4e);
            x4 = _mm_add_epi32(x4, x3);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 18);
            x5 = _mm_srli_epi32(x5, 14);
            x0 = _mm_xor_si128(x0, x4);
            x3 = _mm_shuffle_epi32(x3, 0x39);
            x0 = _mm_xor_si128(x0, x5);
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

#endif
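
/*
 * Illustrative only (not part of the original file; compiled out below):
 * a minimal sketch of calling scrypt_ChunkMix_sse2, assuming hypothetical
 * chunk buffers of 32*r words (2*r blocks of 64 bytes each). Passing NULL for
 * Bxor skips the xor step, as every implementation above checks explicitly;
 * the sketch keeps Bout and Bin as separate buffers.
 */
#if 0
static void
example_chunkmix_usage(uint32_t *chunk_out, uint32_t *chunk_in, uint32_t *chunk_xor, uint32_t r) {
    /* X = H(chunk_in): mix one chunk, interleaving even/odd output blocks */
    scrypt_ChunkMix_sse2(chunk_out, chunk_in, NULL, r);

    /* X = H(chunk_in ^ chunk_xor): the form used when a second chunk is xored in */
    scrypt_ChunkMix_sse2(chunk_out, chunk_in, chunk_xor, r);
}
#endif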

#if defined(SCRYPT_SALSA_SSE2)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa/8-SSE2"
    #undef SCRYPT_SALSA_INCLUDED
    #define SCRYPT_SALSA_INCLUDED
#endif

/* used by avx, etc. as well */
#if defined(SCRYPT_SALSA_INCLUDED)
/*
    Default layout:
     0  1  2  3
     4  5  6  7
     8  9 10 11
    12 13 14 15

    SSE2 layout:
     0  5 10 15
    12  1  6 11
     8 13  2  7
     4  9 14  3
*/

static void asm_calling_convention
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
    uint32_t t;
    while (count--) {
        t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
        t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
        t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
        t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
        t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
        t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
        blocks += 16;
    }
}
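
/*
 * The swaps above are disjoint transpositions, so the permutation is its own
 * inverse: the same routine converts between the default and SSE2 layouts in
 * either direction.
 */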
#endif