GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_salsa64-ssse3.h
/* x64 */
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSSE3

asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
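    /*
        The body below reads Bout from rdi, Bin from rsi, Bxor from rdx and r
        from rcx, i.e. the SysV AMD64 argument registers. The prologue aligns
        rsp to 64 bytes and reserves 128 bytes of scratch for the feedforward
        copy of the current block; rcx becomes the chunk size in bytes
        (2 * r * 128), rax points at the last 128-byte block of Bin (step 1:
        X = B_{2r-1}), r9 at the last block of Bxor, and `and rdx, rdx`
        records in ZF whether Bxor is NULL.
    */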
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[rcx*2])
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a2(movdqa xmm4,[rax+64])
    a2(movdqa xmm5,[rax+80])
    a2(movdqa xmm6,[rax+96])
    a2(movdqa xmm7,[rax+112])
    a1(jz scrypt_ChunkMix_ssse3_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a2(pxor xmm4,[r9+64])
    a2(pxor xmm5,[r9+80])
    a2(pxor xmm6,[r9+96])
    a2(pxor xmm7,[r9+112])
    a1(scrypt_ChunkMix_ssse3_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
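    /*
        Main loop over the 2*r input blocks: xmm0..xmm7 carry the running
        128-byte state X, r9 is the byte offset of the current block, and r8
        toggles between 0 and the chunk size to interleave even/odd output
        blocks. pxor does not modify flags, so the ZF set by `and rdx, rdx`
        at the top of each iteration survives to the jz that skips the Bxor
        pass.
    */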
    a1(scrypt_ChunkMix_ssse3_loop:)
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a2(pxor xmm4,[rsi+r9+64])
    a2(pxor xmm5,[rsi+r9+80])
    a2(pxor xmm6,[rsi+r9+96])
    a2(pxor xmm7,[rsi+r9+112])
    a1(jz scrypt_ChunkMix_ssse3_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a2(pxor xmm4,[rdx+r9+64])
    a2(pxor xmm5,[rdx+r9+80])
    a2(pxor xmm6,[rdx+r9+96])
    a2(pxor xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_ssse3_no_xor2:)
    a2(movdqa [rsp+0],xmm0)
    a2(movdqa [rsp+16],xmm1)
    a2(movdqa [rsp+32],xmm2)
    a2(movdqa [rsp+48],xmm3)
    a2(movdqa [rsp+64],xmm4)
    a2(movdqa [rsp+80],xmm5)
    a2(movdqa [rsp+96],xmm6)
    a2(movdqa [rsp+112],xmm7)
    a2(mov rax,8)
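    /*
        Salsa64/8 core: rax counts the 8 rounds down two at a time (see the
        `sub rax, 2` / `ja` pair below). SSE2 has no 64-bit rotate, so
        rotl64(x, n) is built from a psllq/psrlq pair whose two halves are
        xored into the target separately, rotl64(x, 32) is done with
        `pshufd 0xb1` (swap the 32-bit halves of each 64-bit lane), and the
        SSSE3 palignr instructions re-pair the 64-bit lanes between the two
        half-rounds.
    */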
    a1(scrypt_salsa64_ssse3_loop: )
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm4, xmm10)
    a2(pxor xmm5, xmm11)
    a2(pxor xmm4, xmm8)
    a2(pxor xmm5, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm4)
    a2(paddq xmm11, xmm5)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm4)
    a2(movdqa xmm9, xmm5)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm10, xmm2)
    a2(movdqa xmm11, xmm3)
    a2(movdqa xmm2, xmm6)
    a2(movdqa xmm3, xmm7)
    a3(palignr xmm2, xmm7, 8)
    a3(palignr xmm3, xmm6, 8)
    a2(movdqa xmm6, xmm11)
    a2(movdqa xmm7, xmm10)
    a3(palignr xmm6, xmm10, 8)
    a3(palignr xmm7, xmm11, 8)
    a2(sub rax, 2)
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm5, xmm10)
    a2(pxor xmm4, xmm11)
    a2(pxor xmm5, xmm8)
    a2(pxor xmm4, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm5)
    a2(paddq xmm11, xmm4)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm5)
    a2(movdqa xmm9, xmm4)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm10, xmm2)
    a2(movdqa xmm11, xmm3)
    a2(movdqa xmm2, xmm6)
    a2(movdqa xmm3, xmm7)
    a3(palignr xmm2, xmm7, 8)
    a3(palignr xmm3, xmm6, 8)
    a2(movdqa xmm6, xmm11)
    a2(movdqa xmm7, xmm10)
    a3(palignr xmm6, xmm10, 8)
    a3(palignr xmm7, xmm11, 8)
    a1(ja scrypt_salsa64_ssse3_loop)
    a2(paddq xmm0,[rsp+0])
    a2(paddq xmm1,[rsp+16])
    a2(paddq xmm2,[rsp+32])
    a2(paddq xmm3,[rsp+48])
    a2(paddq xmm4,[rsp+64])
    a2(paddq xmm5,[rsp+80])
    a2(paddq xmm6,[rsp+96])
    a2(paddq xmm7,[rsp+112])
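    /*
        Feedforward done; the address arithmetic below stores this block at
        Bout + 128*((i/2) + (i odd ? r : 0)): with r9 = 128*i and r8 either 0
        or the chunk size, `and rax, ~0xff` followed by `shr rax, 1` sends
        even blocks to the first half of the output chunk and odd blocks to
        the second half, matching `(i / 2) + half` in the intrinsic version
        below.
    */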
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a2(movdqa [rax+64],xmm4)
    a2(movdqa [rax+80],xmm5)
    a2(movdqa [rax+96],xmm6)
    a2(movdqa [rax+112],xmm7)
    a1(jne scrypt_ChunkMix_ssse3_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif

/* intrinsic */
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)

#define SCRYPT_SALSA64_SSSE3

static void asm_calling_convention
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;
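    /*
        Here xmmi is scrypt-jane's alias for __m128i and scrypt_block(B, n)
        yields a pointer to the n-th 128-byte block of B; both are defined in
        the surrounding scrypt-jane headers.
    */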

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;
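        /*
            Each step of the 8-round loop below is a 64-bit add, rotate and
            xor, vectorized two lanes at a time. A rotation such as
            rotl64(z, 13) is expressed as two shifts that are both xored into
            the target, roughly (sketch only, not part of the build):

                x4 = _mm_xor_si128(x4, _mm_srli_epi64(z0, 64-13));
                x4 = _mm_xor_si128(x4, _mm_slli_epi64(z0, 13));

            Rotation by 32 uses _mm_shuffle_epi32 with _MM_SHUFFLE(2,3,0,1) to
            swap the 32-bit halves of each 64-bit lane, and
            _mm_alignr_epi8(a, b, 8) builds a vector from the high half of b
            and the low half of a to re-pair the lanes between half-rounds.
        */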

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x5 = _mm_xor_si128(x5, z2);
            x4 = _mm_xor_si128(x4, z3);
            x5 = _mm_xor_si128(x5, z0);
            x4 = _mm_xor_si128(x4, z1);

            z0 = _mm_add_epi64(x5, x6);
            z1 = _mm_add_epi64(x4, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x5);
            z1 = _mm_add_epi64(x3, x4);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
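        /* even i lands in the first half of Bout, odd i in the second half:
           half toggles between 0 and r on every iteration (half ^= r) */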
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_SSSE3)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSSE3"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif