GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_salsa64-avx.h
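/*
	Salsa64/8 ChunkMix for scrypt-jane, AVX variants. Two interchangeable
	implementations follow: an x86-64 naked-function assembly version gated on
	X86_64ASM_AVX, and an SSE/AVX intrinsic version gated on X86_INTRINSIC_AVX.
	Whichever one is compiled defines SCRYPT_SALSA64_AVX, which selects the
	"Salsa64/8-AVX" SCRYPT_MIX name at the bottom of the file.
*/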
/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* arguments as used below: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), rcx = r */
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)        /* align the scratch area below */
	a2(sub rsp, 128)        /* 128-byte scratch for the copy of X added back after the rounds */
	a2(lea rcx,[rcx*2])
	a2(shl rcx,7)           /* rcx = 2*r*128 = chunk size in bytes */
	a2(lea r9,[rcx-128])    /* byte offset of the last block in a chunk */
	a2(lea rax,[rsi+r9])    /* rax -> last block of Bin */
	a2(lea r9,[rdx+r9])     /* r9  -> last block of Bxor */
	a2(and rdx, rdx)        /* ZF set when Bxor == NULL */
	/* 1: X = B_{2r-1}, optionally xored with the matching Bxor block */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	a2(vmovdqa xmm4,[rax+64])
	a2(vmovdqa xmm5,[rax+80])
	a2(vmovdqa xmm6,[rax+96])
	a2(vmovdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a3(vpxor xmm4,xmm4,[r9+64])
	a3(vpxor xmm5,xmm5,[r9+80])
	a3(vpxor xmm6,xmm6,[r9+96])
	a3(vpxor xmm7,xmm7,[r9+112])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r9,r9)           /* r9 = byte offset of the current input block */
	a2(xor r8,r8)           /* r8 toggles 0 <-> chunk size to interleave even/odd output blocks */
	/* 2: for i = 0 to 2r-1: X = H(X ^ B_i), optionally xored with Bxor_i */
	a1(scrypt_ChunkMix_avx_loop:)
	a2(and rdx, rdx)        /* re-test Bxor for the jz below */
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	a3(vpxor xmm4,xmm4,[rsi+r9+64])
	a3(vpxor xmm5,xmm5,[rsi+r9+80])
	a3(vpxor xmm6,xmm6,[rsi+r9+96])
	a3(vpxor xmm7,xmm7,[rsi+r9+112])
	a1(jz scrypt_ChunkMix_avx_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a3(vpxor xmm4,xmm4,[rdx+r9+64])
	a3(vpxor xmm5,xmm5,[rdx+r9+80])
	a3(vpxor xmm6,xmm6,[rdx+r9+96])
	a3(vpxor xmm7,xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_avx_no_xor2:)
	/* save X so it can be added back after the rounds */
	a2(vmovdqa [rsp+0],xmm0)
	a2(vmovdqa [rsp+16],xmm1)
	a2(vmovdqa [rsp+32],xmm2)
	a2(vmovdqa [rsp+48],xmm3)
	a2(vmovdqa [rsp+64],xmm4)
	a2(vmovdqa [rsp+80],xmm5)
	a2(vmovdqa [rsp+96],xmm6)
	a2(vmovdqa [rsp+112],xmm7)
	a2(mov rax,8)           /* 8 Salsa64 rounds, two per iteration of the inner loop */
	a1(scrypt_salsa64_avx_loop: )
	a3(vpaddq xmm8, xmm0, xmm2)
	a3(vpaddq xmm9, xmm1, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm6, xmm6, xmm8)
	a3(vpxor xmm7, xmm7, xmm9)
	a3(vpaddq xmm10, xmm0, xmm6)
	a3(vpaddq xmm11, xmm1, xmm7)
	a3(vpsrlq xmm8, xmm10, 51)
	a3(vpsrlq xmm9, xmm11, 51)
	a3(vpsllq xmm10, xmm10, 13)
	a3(vpsllq xmm11, xmm11, 13)
	a3(vpxor xmm4, xmm4, xmm8)
	a3(vpxor xmm5, xmm5, xmm9)
	a3(vpxor xmm4, xmm4, xmm10)
	a3(vpxor xmm5, xmm5, xmm11)
	a3(vpaddq xmm8, xmm6, xmm4)
	a3(vpaddq xmm9, xmm7, xmm5)
	a3(vpsrlq xmm10, xmm8, 25)
	a3(vpsrlq xmm11, xmm9, 25)
	a3(vpsllq xmm8, xmm8, 39)
	a3(vpsllq xmm9, xmm9, 39)
	a3(vpxor xmm2, xmm2, xmm10)
	a3(vpxor xmm3, xmm3, xmm11)
	a3(vpxor xmm2, xmm2, xmm8)
	a3(vpxor xmm3, xmm3, xmm9)
	a3(vpaddq xmm10, xmm4, xmm2)
	a3(vpaddq xmm11, xmm5, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm0, xmm0, xmm10)
	a3(vpxor xmm1, xmm1, xmm11)
	a2(vmovdqa xmm8, xmm2)
	a2(vmovdqa xmm9, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm9, xmm8, 8)
	a4(vpalignr xmm7, xmm8, xmm9, 8)
	a2(sub rax, 2)
	a3(vpaddq xmm10, xmm0, xmm2)
	a3(vpaddq xmm11, xmm1, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm6, xmm6, xmm10)
	a3(vpxor xmm7, xmm7, xmm11)
	a3(vpaddq xmm8, xmm0, xmm6)
	a3(vpaddq xmm9, xmm1, xmm7)
	a3(vpsrlq xmm10, xmm8, 51)
	a3(vpsrlq xmm11, xmm9, 51)
	a3(vpsllq xmm8, xmm8, 13)
	a3(vpsllq xmm9, xmm9, 13)
	a3(vpxor xmm5, xmm5, xmm10)
	a3(vpxor xmm4, xmm4, xmm11)
	a3(vpxor xmm5, xmm5, xmm8)
	a3(vpxor xmm4, xmm4, xmm9)
	a3(vpaddq xmm10, xmm6, xmm5)
	a3(vpaddq xmm11, xmm7, xmm4)
	a3(vpsrlq xmm8, xmm10, 25)
	a3(vpsrlq xmm9, xmm11, 25)
	a3(vpsllq xmm10, xmm10, 39)
	a3(vpsllq xmm11, xmm11, 39)
	a3(vpxor xmm2, xmm2, xmm8)
	a3(vpxor xmm3, xmm3, xmm9)
	a3(vpxor xmm2, xmm2, xmm10)
	a3(vpxor xmm3, xmm3, xmm11)
	a3(vpaddq xmm8, xmm5, xmm2)
	a3(vpaddq xmm9, xmm4, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm0, xmm0, xmm8)
	a3(vpxor xmm1, xmm1, xmm9)
	a2(vmovdqa xmm10, xmm2)
	a2(vmovdqa xmm11, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm11, xmm10, 8)
	a4(vpalignr xmm7, xmm10, xmm11, 8)
	a1(ja scrypt_salsa64_avx_loop)
	/* feed-forward: X += the copy saved before the rounds */
	a3(vpaddq xmm0,xmm0,[rsp+0])
	a3(vpaddq xmm1,xmm1,[rsp+16])
	a3(vpaddq xmm2,xmm2,[rsp+32])
	a3(vpaddq xmm3,xmm3,[rsp+48])
	a3(vpaddq xmm4,xmm4,[rsp+64])
	a3(vpaddq xmm5,xmm5,[rsp+80])
	a3(vpaddq xmm6,xmm6,[rsp+96])
	a3(vpaddq xmm7,xmm7,[rsp+112])
	/* 4/6: interleave the output - block i goes to Bout[i/2] when i is even, Bout[r + i/2] when i is odd */
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	a2(vmovdqa [rax+64],xmm4)
	a2(vmovdqa [rax+80],xmm5)
	a2(vmovdqa [rax+96],xmm6)
	a2(vmovdqa [rax+112],xmm7)
	a1(jne scrypt_ChunkMix_avx_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif
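
/*
	Note on the rotations used by the assembly version above and the intrinsic
	version below: the Salsa64 core rotates 64-bit lanes by 32, 13 and 39. The
	rotate by 32 is a single dword shuffle - vpshufd / _mm_shuffle_epi32 with
	control 0xb1 (= _MM_SHUFFLE(2,3,0,1)) swaps the two 32-bit halves of every
	64-bit lane. The 13- and 39-bit rotates have no single SSE/AVX instruction,
	so they are built from a shift pair; the code xors the two shifted halves in
	separately instead of or-ing them first. A minimal sketch of the
	equivalences, with illustrative helper names (disabled, reference only):
*/
#if 0
static __m128i rotl64x2_by32(__m128i x) {
	/* rotl64(x, 32): swap the dword halves of each qword */
	return _mm_shuffle_epi32(x, _MM_SHUFFLE(2,3,0,1));
}

static __m128i rotl64x2_by13(__m128i x) {
	/* rotl64(x, 13) = (x << 13) | (x >> (64 - 13)) */
	return _mm_or_si128(_mm_slli_epi64(x, 13), _mm_srli_epi64(x, 64 - 13));
}
#endif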

/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)

#define SCRYPT_SALSA64_AVX

static void asm_calling_convention
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	/* half toggles between 0 and r so even-numbered blocks land in Bout[0..r-1] and odd ones in Bout[r..2r-1] */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

		/* 8 rounds of the Salsa64 core, two per iteration; rotations are by 32 (dword shuffle), 13 and 39 */
		for (rounds = 8; rounds; rounds -= 2) {
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z2);
			x5 = _mm_xor_si128(x5, z3);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);

			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z2 = _mm_srli_epi64(z0, 64-13);
			z3 = _mm_srli_epi64(z1, 64-13);
			z0 = _mm_slli_epi64(z0, 13);
			z1 = _mm_slli_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z2);
			x4 = _mm_xor_si128(x4, z3);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z2 = _mm_srli_epi64(z0, 64-39);
			z3 = _mm_srli_epi64(z1, 64-39);
			z0 = _mm_slli_epi64(z0, 39);
			z1 = _mm_slli_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z2);
			x3 = _mm_xor_si128(x3, z3);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}

#endif
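
/*
	Usage sketch (illustrative only - the caller below is hypothetical, not code
	from scrypt-jane): a chunk is 2*r blocks of 128 bytes, i.e. 32*r uint64_t
	words, and Bxor may be NULL when there is nothing to xor in.
*/
#if 0
static void chunkmix_usage_example(uint64_t *Y, uint64_t *X, uint64_t *Vj, uint32_t r) {
	/* Y = BlockMix(X) */
	scrypt_ChunkMix_avx(Y, X, NULL, r);

	/* Y = BlockMix(X ^ Vj), xoring the second chunk in on the fly */
	scrypt_ChunkMix_avx(Y, X, Vj, r);
}
#endif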

#if defined(SCRYPT_SALSA64_AVX)
	/* uses salsa64_core_tangle_sse2 */

	#undef SCRYPT_MIX
	#define SCRYPT_MIX "Salsa64/8-AVX"
	#undef SCRYPT_SALSA64_INCLUDED
	#define SCRYPT_SALSA64_INCLUDED
#endif