GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_salsa64-sse2.h
/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSE2

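/*
	scrypt ChunkMix (BlockMix) for the Salsa64/8 mix function, x64 SSE2 assembly.
	Starting from X = B[2r-1], each 128-byte block i of Bin (xored with the
	matching block of Bxor when Bxor is non-NULL) is folded into X and run
	through 8 rounds of Salsa64; the results are written to Bout with the
	even-indexed blocks in the first half and the odd-indexed blocks in the
	second half. Bxor may be NULL.
*/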
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
a1(push rbp)
a2(mov rbp, rsp)
a2(and rsp, ~63)
a2(sub rsp, 128)
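/* rcx holds r on entry: scale it to the chunk size in bytes (2*r*128), point
   rax/r9 at the last 128-byte block of Bin/Bxor, and set ZF if Bxor is NULL */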
a2(lea rcx,[rcx*2])
a2(shl rcx,7)
a2(lea r9,[rcx-128])
a2(lea rax,[rsi+r9])
a2(lea r9,[rdx+r9])
a2(and rdx, rdx)
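/* 1: X = B[2r-1] -- load the last block of Bin, xoring in the last block of
   Bxor when one was supplied */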
a2(movdqa xmm0,[rax+0])
a2(movdqa xmm1,[rax+16])
a2(movdqa xmm2,[rax+32])
a2(movdqa xmm3,[rax+48])
a2(movdqa xmm4,[rax+64])
a2(movdqa xmm5,[rax+80])
a2(movdqa xmm6,[rax+96])
a2(movdqa xmm7,[rax+112])
a1(jz scrypt_ChunkMix_sse2_no_xor1)
a2(pxor xmm0,[r9+0])
a2(pxor xmm1,[r9+16])
a2(pxor xmm2,[r9+32])
a2(pxor xmm3,[r9+48])
a2(pxor xmm4,[r9+64])
a2(pxor xmm5,[r9+80])
a2(pxor xmm6,[r9+96])
a2(pxor xmm7,[r9+112])
a1(scrypt_ChunkMix_sse2_no_xor1:)
a2(xor r9,r9)
a2(xor r8,r8)
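/* 2: for i = 0 to 2r-1 -- r9 is the byte offset of block i, r8 toggles between
   0 and the chunk size so output blocks are interleaved even/odd */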
a1(scrypt_ChunkMix_sse2_loop:)
a2(and rdx, rdx)
a2(pxor xmm0,[rsi+r9+0])
a2(pxor xmm1,[rsi+r9+16])
a2(pxor xmm2,[rsi+r9+32])
a2(pxor xmm3,[rsi+r9+48])
a2(pxor xmm4,[rsi+r9+64])
a2(pxor xmm5,[rsi+r9+80])
a2(pxor xmm6,[rsi+r9+96])
a2(pxor xmm7,[rsi+r9+112])
a1(jz scrypt_ChunkMix_sse2_no_xor2)
a2(pxor xmm0,[rdx+r9+0])
a2(pxor xmm1,[rdx+r9+16])
a2(pxor xmm2,[rdx+r9+32])
a2(pxor xmm3,[rdx+r9+48])
a2(pxor xmm4,[rdx+r9+64])
a2(pxor xmm5,[rdx+r9+80])
a2(pxor xmm6,[rdx+r9+96])
a2(pxor xmm7,[rdx+r9+112])
a1(scrypt_ChunkMix_sse2_no_xor2:)
a2(movdqa [rsp+0],xmm0)
a2(movdqa [rsp+16],xmm1)
a2(movdqa [rsp+32],xmm2)
a2(movdqa [rsp+48],xmm3)
a2(movdqa [rsp+64],xmm4)
a2(movdqa [rsp+80],xmm5)
a2(movdqa [rsp+96],xmm6)
a2(movdqa [rsp+112],xmm7)
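/* X was saved to the stack above for the feed-forward add; rax counts 8 rounds
   of Salsa64, two per pass through the loop below */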
a2(mov rax,8)
a1(scrypt_salsa64_sse2_loop: )
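/* pshufd with 0xb1 swaps the 32-bit halves of each 64-bit lane (a rotate by
   32); the psrlq/psllq pairs below implement rotates by 13 and 39 */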
a2(movdqa xmm8, xmm0)
a2(movdqa xmm9, xmm1)
a2(paddq xmm8, xmm2)
a2(paddq xmm9, xmm3)
a3(pshufd xmm8, xmm8, 0xb1)
a3(pshufd xmm9, xmm9, 0xb1)
a2(pxor xmm6, xmm8)
a2(pxor xmm7, xmm9)
a2(movdqa xmm10, xmm0)
a2(movdqa xmm11, xmm1)
a2(paddq xmm10, xmm6)
a2(paddq xmm11, xmm7)
a2(movdqa xmm8, xmm10)
a2(movdqa xmm9, xmm11)
a2(psrlq xmm10, 51)
a2(psrlq xmm11, 51)
a2(psllq xmm8, 13)
a2(psllq xmm9, 13)
a2(pxor xmm4, xmm10)
a2(pxor xmm5, xmm11)
a2(pxor xmm4, xmm8)
a2(pxor xmm5, xmm9)
a2(movdqa xmm10, xmm6)
a2(movdqa xmm11, xmm7)
a2(paddq xmm10, xmm4)
a2(paddq xmm11, xmm5)
a2(movdqa xmm8, xmm10)
a2(movdqa xmm9, xmm11)
a2(psrlq xmm10, 25)
a2(psrlq xmm11, 25)
a2(psllq xmm8, 39)
a2(psllq xmm9, 39)
a2(pxor xmm2, xmm10)
a2(pxor xmm3, xmm11)
a2(pxor xmm2, xmm8)
a2(pxor xmm3, xmm9)
a2(movdqa xmm8, xmm4)
a2(movdqa xmm9, xmm5)
a2(paddq xmm8, xmm2)
a2(paddq xmm9, xmm3)
a3(pshufd xmm8, xmm8, 0xb1)
a3(pshufd xmm9, xmm9, 0xb1)
a2(pxor xmm0, xmm8)
a2(pxor xmm1, xmm9)
a2(movdqa xmm8, xmm2)
a2(movdqa xmm9, xmm3)
a2(movdqa xmm10, xmm6)
a2(movdqa xmm11, xmm7)
a2(movdqa xmm2, xmm7)
a2(movdqa xmm3, xmm6)
a2(punpcklqdq xmm10, xmm6)
a2(punpcklqdq xmm11, xmm7)
a2(movdqa xmm6, xmm8)
a2(movdqa xmm7, xmm9)
a2(punpcklqdq xmm9, xmm9)
a2(punpcklqdq xmm8, xmm8)
a2(punpckhqdq xmm2, xmm10)
a2(punpckhqdq xmm3, xmm11)
a2(punpckhqdq xmm6, xmm9)
a2(punpckhqdq xmm7, xmm8)
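/* second round of the pair, operating on the lanes re-tangled above */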
a2(sub rax, 2)
a2(movdqa xmm8, xmm0)
a2(movdqa xmm9, xmm1)
a2(paddq xmm8, xmm2)
a2(paddq xmm9, xmm3)
a3(pshufd xmm8, xmm8, 0xb1)
a3(pshufd xmm9, xmm9, 0xb1)
a2(pxor xmm6, xmm8)
a2(pxor xmm7, xmm9)
a2(movdqa xmm10, xmm0)
a2(movdqa xmm11, xmm1)
a2(paddq xmm10, xmm6)
a2(paddq xmm11, xmm7)
a2(movdqa xmm8, xmm10)
a2(movdqa xmm9, xmm11)
a2(psrlq xmm10, 51)
a2(psrlq xmm11, 51)
a2(psllq xmm8, 13)
a2(psllq xmm9, 13)
a2(pxor xmm5, xmm10)
a2(pxor xmm4, xmm11)
a2(pxor xmm5, xmm8)
a2(pxor xmm4, xmm9)
a2(movdqa xmm10, xmm6)
a2(movdqa xmm11, xmm7)
a2(paddq xmm10, xmm5)
a2(paddq xmm11, xmm4)
a2(movdqa xmm8, xmm10)
a2(movdqa xmm9, xmm11)
a2(psrlq xmm10, 25)
a2(psrlq xmm11, 25)
a2(psllq xmm8, 39)
a2(psllq xmm9, 39)
a2(pxor xmm2, xmm10)
a2(pxor xmm3, xmm11)
a2(pxor xmm2, xmm8)
a2(pxor xmm3, xmm9)
a2(movdqa xmm8, xmm5)
a2(movdqa xmm9, xmm4)
a2(paddq xmm8, xmm2)
a2(paddq xmm9, xmm3)
a3(pshufd xmm8, xmm8, 0xb1)
a3(pshufd xmm9, xmm9, 0xb1)
a2(pxor xmm0, xmm8)
a2(pxor xmm1, xmm9)
a2(movdqa xmm8, xmm2)
a2(movdqa xmm9, xmm3)
a2(movdqa xmm10, xmm6)
a2(movdqa xmm11, xmm7)
a2(movdqa xmm2, xmm7)
a2(movdqa xmm3, xmm6)
a2(punpcklqdq xmm10, xmm6)
a2(punpcklqdq xmm11, xmm7)
a2(movdqa xmm6, xmm8)
a2(movdqa xmm7, xmm9)
a2(punpcklqdq xmm9, xmm9)
a2(punpcklqdq xmm8, xmm8)
a2(punpckhqdq xmm2, xmm10)
a2(punpckhqdq xmm3, xmm11)
a2(punpckhqdq xmm6, xmm9)
a2(punpckhqdq xmm7, xmm8)
a1(ja scrypt_salsa64_sse2_loop)
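/* feed-forward: add the input block saved at [rsp] before the rounds */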
a2(paddq xmm0,[rsp+0])
a2(paddq xmm1,[rsp+16])
a2(paddq xmm2,[rsp+32])
a2(paddq xmm3,[rsp+48])
a2(paddq xmm4,[rsp+64])
a2(paddq xmm5,[rsp+80])
a2(paddq xmm6,[rsp+96])
a2(paddq xmm7,[rsp+112])
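/* store block i: even i goes to Bout + (i/2)*128, odd i to Bout + (r + i/2)*128;
   advance r9 by one block and loop until the whole chunk has been processed */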
a2(lea rax,[r8+r9])
a2(xor r8,rcx)
a2(and rax,~0xff)
a2(add r9,128)
a2(shr rax,1)
a2(add rax, rdi)
a2(cmp r9,rcx)
a2(movdqa [rax+0],xmm0)
a2(movdqa [rax+16],xmm1)
a2(movdqa [rax+32],xmm2)
a2(movdqa [rax+48],xmm3)
a2(movdqa [rax+64],xmm4)
a2(movdqa [rax+80],xmm5)
a2(movdqa [rax+96],xmm6)
a2(movdqa [rax+112],xmm7)
a1(jne scrypt_ChunkMix_sse2_loop)
a2(mov rsp, rbp)
a1(pop rbp)
a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif

/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)

#define SCRYPT_SALSA64_SSE2

static void asm_calling_convention
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

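		/* 8 rounds of Salsa64, two per iteration; the 64-bit rotations are by
		   32 (via the 32-bit lane shuffle), 13 and 39 */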
		for (rounds = 8; rounds; rounds -= 2) {
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

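			/* interleave the 64-bit lanes so the second round of the pair
			   operates on the re-tangled state */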
			z0 = x4;
			z1 = x5;
			z2 = x2;
			z3 = x3;
			x4 = z1;
			x5 = z0;
			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));

			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x4;
			z1 = x5;
			z2 = x2;
			z3 = x3;
			x4 = z1;
			x5 = z0;
			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
		}

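		/* feed-forward: add the input block captured in t0..t7 */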
		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}

#endif
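
/*
	Illustrative usage sketch (not part of this header): mix one chunk of
	2*r blocks, each block being 128 bytes (16 uint64_t words). The buffer
	setup below is an assumption for the example; the real callers live
	elsewhere in scrypt-jane. Note that the SSE2 loads/stores require the
	chunk buffers to be 16-byte aligned.

		uint32_t r = 8;
		size_t words = (size_t)2 * r * 16;             // 2r blocks of 16 words
		uint64_t *Bin  = aligned_alloc(16, words * sizeof(uint64_t));
		uint64_t *Bout = aligned_alloc(16, words * sizeof(uint64_t));
		// ... fill Bin with the chunk to mix ...
		scrypt_ChunkMix_sse2(Bout, Bin, NULL, r);      // Bxor may be NULL
*/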

#if defined(SCRYPT_SALSA64_SSE2)
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8-SSE2"
#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#endif

/* sse3/avx use this as well */
#if defined(SCRYPT_SALSA64_INCLUDED)
/*
	Default layout:
	 0  1  2  3
	 4  5  6  7
	 8  9 10 11
	12 13 14 15

	SSE2 layout:
	 0  5 10 15
	12  1  6 11
	 8 13  2  7
	 4  9 14  3
*/

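/*
	salsa64_core_tangle_sse2 converts a 16-word (128-byte) block between the
	default layout and the SSE2 layout shown above. It only performs disjoint
	pair swaps, so applying it a second time restores the original order.
*/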
static void asm_calling_convention
salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
	uint64_t t;
	while (count--) {
		t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
		t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
		t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
		t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
		t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
		t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
		blocks += 16;
	}
}
#endif