GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_salsa-avx.h
/* x86 */
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
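/*
 * scrypt ChunkMix (BlockMix with the Salsa20/8 core), 32-bit AVX version.
 * Arguments are read from the stack: after the four pushes below,
 * [ebp+20] = Bout, [ebp+24] = Bin, [ebp+28] = Bxor (may be NULL),
 * [ebp+32] = r.  The current 64-byte block lives in xmm0-xmm3.
 */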
asm_naked_fn(scrypt_ChunkMix_avx)
    a1(push ebx)
    a1(push edi)
    a1(push esi)
    a1(push ebp)
    a2(mov ebp,esp)
    a2(mov edi,[ebp+20])
    a2(mov esi,[ebp+24])
    a2(mov eax,[ebp+28])
    a2(mov ebx,[ebp+32])
    a2(sub esp,32)
    a2(and esp,~63)
    a2(lea edx,[ebx*2])
    a2(shl edx,6)
    a2(lea ecx,[edx-64])
    a2(and eax, eax)
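    /* edx = chunk size in bytes (2*r*64), ecx = byte offset of the last
       64-byte block; the `and eax,eax` above sets ZF when Bxor is NULL.
       Load X = B[2r-1] from Bin and, if Bxor is non-NULL, xor in its last
       block as well. */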
    a2(movdqa xmm0,[ecx+esi+0])
    a2(movdqa xmm1,[ecx+esi+16])
    a2(movdqa xmm2,[ecx+esi+32])
    a2(movdqa xmm3,[ecx+esi+48])
    a1(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[ecx+eax+0])
    a3(vpxor xmm1,xmm1,[ecx+eax+16])
    a3(vpxor xmm2,xmm2,[ecx+eax+32])
    a3(vpxor xmm3,xmm3,[ecx+eax+48])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor ecx,ecx)
    a2(xor ebx,ebx)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and eax, eax)
    a3(vpxor xmm0,xmm0,[esi+ecx+0])
    a3(vpxor xmm1,xmm1,[esi+ecx+16])
    a3(vpxor xmm2,xmm2,[esi+ecx+32])
    a3(vpxor xmm3,xmm3,[esi+ecx+48])
    a1(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[eax+ecx+0])
    a3(vpxor xmm1,xmm1,[eax+ecx+16])
    a3(vpxor xmm2,xmm2,[eax+ecx+32])
    a3(vpxor xmm3,xmm3,[eax+ecx+48])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa [esp+0],xmm0)
    a2(vmovdqa [esp+16],xmm1)
    a2(vmovdqa xmm6,xmm2)
    a2(vmovdqa xmm7,xmm3)
    a2(mov eax,8)
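    /* Salsa20/8 core: each pass through the loop below performs two rounds,
       so eax counts 8,6,4,2.  xmm0/xmm1 were spilled to the 64-byte-aligned
       stack area and xmm2/xmm3 copied to xmm6/xmm7 so the input block can be
       added back in afterwards (the feedforward). */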
    a1(scrypt_salsa_avx_loop: )
    a3(vpaddd xmm4, xmm1, xmm0)
    a3(vpsrld xmm5, xmm4, 25)
    a3(vpslld xmm4, xmm4, 7)
    a3(vpxor xmm3, xmm3, xmm5)
    a3(vpxor xmm3, xmm3, xmm4)
    a3(vpaddd xmm4, xmm0, xmm3)
    a3(vpsrld xmm5, xmm4, 23)
    a3(vpslld xmm4, xmm4, 9)
    a3(vpxor xmm2, xmm2, xmm5)
    a3(vpxor xmm2, xmm2, xmm4)
    a3(vpaddd xmm4, xmm3, xmm2)
    a3(vpsrld xmm5, xmm4, 19)
    a3(vpslld xmm4, xmm4, 13)
    a3(vpxor xmm1, xmm1, xmm5)
    a3(pshufd xmm3, xmm3, 0x93)
    a3(vpxor xmm1, xmm1, xmm4)
    a3(vpaddd xmm4, xmm2, xmm1)
    a3(vpsrld xmm5, xmm4, 14)
    a3(vpslld xmm4, xmm4, 18)
    a3(vpxor xmm0, xmm0, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a3(vpxor xmm0, xmm0, xmm4)
    a2(sub eax, 2)
    a3(vpaddd xmm4, xmm3, xmm0)
    a3(pshufd xmm1, xmm1, 0x39)
    a3(vpsrld xmm5, xmm4, 25)
    a3(vpslld xmm4, xmm4, 7)
    a3(vpxor xmm1, xmm1, xmm5)
    a3(vpxor xmm1, xmm1, xmm4)
    a3(vpaddd xmm4, xmm0, xmm1)
    a3(vpsrld xmm5, xmm4, 23)
    a3(vpslld xmm4, xmm4, 9)
    a3(vpxor xmm2, xmm2, xmm5)
    a3(vpxor xmm2, xmm2, xmm4)
    a3(vpaddd xmm4, xmm1, xmm2)
    a3(vpsrld xmm5, xmm4, 19)
    a3(vpslld xmm4, xmm4, 13)
    a3(vpxor xmm3, xmm3, xmm5)
    a3(pshufd xmm1, xmm1, 0x93)
    a3(vpxor xmm3, xmm3, xmm4)
    a3(vpaddd xmm4, xmm2, xmm3)
    a3(vpsrld xmm5, xmm4, 14)
    a3(vpslld xmm4, xmm4, 18)
    a3(vpxor xmm0, xmm0, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a3(vpxor xmm0, xmm0, xmm4)
    a3(pshufd xmm3, xmm3, 0x39)
    a1(ja scrypt_salsa_avx_loop)
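    /* Feedforward: add the saved input block back in, then store the result
       to Bout.  Even-numbered blocks go to the first half of the output
       chunk, odd-numbered blocks to the second half; the lea/xor/and/shr
       sequence below computes that destination offset branchlessly (ebx
       toggles between 0 and the chunk size, ecx advances 64 per block). */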
    a3(vpaddd xmm0,xmm0,[esp+0])
    a3(vpaddd xmm1,xmm1,[esp+16])
    a3(vpaddd xmm2,xmm2,xmm6)
    a3(vpaddd xmm3,xmm3,xmm7)
    a2(lea eax,[ebx+ecx])
    a2(xor ebx,edx)
    a2(and eax,~0x7f)
    a2(add ecx,64)
    a2(shr eax,1)
    a2(add eax, edi)
    a2(cmp ecx,edx)
    a2(vmovdqa [eax+0],xmm0)
    a2(vmovdqa [eax+16],xmm1)
    a2(vmovdqa [eax+32],xmm2)
    a2(vmovdqa [eax+48],xmm3)
    a2(mov eax,[ebp+28])
    a1(jne scrypt_ChunkMix_avx_loop)
    a2(mov esp,ebp)
    a1(pop ebp)
    a1(pop esi)
    a1(pop edi)
    a1(pop ebx)
    aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif



/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
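/*
 * 64-bit AVX version.  As used below, the arguments arrive in registers
 * (matching the System V AMD64 convention): rdi = Bout, rsi = Bin,
 * rdx = Bxor (may be NULL), rcx = r.  xmm8-xmm11 hold the saved input
 * block, so no stack spill is needed.
 */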
asm_naked_fn(scrypt_ChunkMix_avx)
    a2(lea rcx,[rcx*2])
    a2(shl rcx,6)
    a2(lea r9,[rcx-64])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(vmovdqa xmm0,[rax+0])
    a2(vmovdqa xmm1,[rax+16])
    a2(vmovdqa xmm2,[rax+32])
    a2(vmovdqa xmm3,[rax+48])
    a1(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[r9+0])
    a3(vpxor xmm1,xmm1,[r9+16])
    a3(vpxor xmm2,xmm2,[r9+32])
    a3(vpxor xmm3,xmm3,[r9+48])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and rdx, rdx)
    a3(vpxor xmm0,xmm0,[rsi+r9+0])
    a3(vpxor xmm1,xmm1,[rsi+r9+16])
    a3(vpxor xmm2,xmm2,[rsi+r9+32])
    a3(vpxor xmm3,xmm3,[rsi+r9+48])
    a1(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[rdx+r9+0])
    a3(vpxor xmm1,xmm1,[rdx+r9+16])
    a3(vpxor xmm2,xmm2,[rdx+r9+32])
    a3(vpxor xmm3,xmm3,[rdx+r9+48])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa xmm8,xmm0)
    a2(vmovdqa xmm9,xmm1)
    a2(vmovdqa xmm10,xmm2)
    a2(vmovdqa xmm11,xmm3)
    a2(mov rax,8)
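    /* Salsa20/8 core, identical to the 32-bit path above: two rounds per
       iteration of the loop below, rax counting 8,6,4,2. */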
    a1(scrypt_salsa_avx_loop: )
    a3(vpaddd xmm4, xmm1, xmm0)
    a3(vpsrld xmm5, xmm4, 25)
    a3(vpslld xmm4, xmm4, 7)
    a3(vpxor xmm3, xmm3, xmm5)
    a3(vpxor xmm3, xmm3, xmm4)
    a3(vpaddd xmm4, xmm0, xmm3)
    a3(vpsrld xmm5, xmm4, 23)
    a3(vpslld xmm4, xmm4, 9)
    a3(vpxor xmm2, xmm2, xmm5)
    a3(vpxor xmm2, xmm2, xmm4)
    a3(vpaddd xmm4, xmm3, xmm2)
    a3(vpsrld xmm5, xmm4, 19)
    a3(vpslld xmm4, xmm4, 13)
    a3(vpxor xmm1, xmm1, xmm5)
    a3(pshufd xmm3, xmm3, 0x93)
    a3(vpxor xmm1, xmm1, xmm4)
    a3(vpaddd xmm4, xmm2, xmm1)
    a3(vpsrld xmm5, xmm4, 14)
    a3(vpslld xmm4, xmm4, 18)
    a3(vpxor xmm0, xmm0, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a3(vpxor xmm0, xmm0, xmm4)
    a2(sub rax, 2)
    a3(vpaddd xmm4, xmm3, xmm0)
    a3(pshufd xmm1, xmm1, 0x39)
    a3(vpsrld xmm5, xmm4, 25)
    a3(vpslld xmm4, xmm4, 7)
    a3(vpxor xmm1, xmm1, xmm5)
    a3(vpxor xmm1, xmm1, xmm4)
    a3(vpaddd xmm4, xmm0, xmm1)
    a3(vpsrld xmm5, xmm4, 23)
    a3(vpslld xmm4, xmm4, 9)
    a3(vpxor xmm2, xmm2, xmm5)
    a3(vpxor xmm2, xmm2, xmm4)
    a3(vpaddd xmm4, xmm1, xmm2)
    a3(vpsrld xmm5, xmm4, 19)
    a3(vpslld xmm4, xmm4, 13)
    a3(vpxor xmm3, xmm3, xmm5)
    a3(pshufd xmm1, xmm1, 0x93)
    a3(vpxor xmm3, xmm3, xmm4)
    a3(vpaddd xmm4, xmm2, xmm3)
    a3(vpsrld xmm5, xmm4, 14)
    a3(vpslld xmm4, xmm4, 18)
    a3(vpxor xmm0, xmm0, xmm5)
    a3(pshufd xmm2, xmm2, 0x4e)
    a3(vpxor xmm0, xmm0, xmm4)
    a3(pshufd xmm3, xmm3, 0x39)
    a1(ja scrypt_salsa_avx_loop)
    a3(vpaddd xmm0,xmm0,xmm8)
    a3(vpaddd xmm1,xmm1,xmm9)
    a3(vpaddd xmm2,xmm2,xmm10)
    a3(vpaddd xmm3,xmm3,xmm11)
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0x7f)
    a2(add r9,64)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(vmovdqa [rax+0],xmm0)
    a2(vmovdqa [rax+16],xmm1)
    a2(vmovdqa [rax+32],xmm2)
    a2(vmovdqa [rax+48],xmm3)
    a1(jne scrypt_ChunkMix_avx_loop)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_AVX
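/*
 * Compiler-intrinsics version for builds that do not use the inline-asm
 * paths above.  Same ChunkMix algorithm: x0-x3 hold the current 64-byte
 * block, t0-t3 keep a copy for the feedforward add after the Salsa20/8
 * rounds.
 */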

static void NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
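        /* Salsa20/8: four double rounds.  Each slli/srli pair on x4/x5,
           followed by the two xors, implements a 32-bit rotate of the sum
           into the destination word. */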

        for (rounds = 8; rounds; rounds -= 2) {
            x4 = x1;
            x4 = _mm_add_epi32(x4, x0);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 7);
            x5 = _mm_srli_epi32(x5, 25);
            x3 = _mm_xor_si128(x3, x4);
            x4 = x0;
            x3 = _mm_xor_si128(x3, x5);
            x4 = _mm_add_epi32(x4, x3);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 9);
            x5 = _mm_srli_epi32(x5, 23);
            x2 = _mm_xor_si128(x2, x4);
            x4 = x3;
            x2 = _mm_xor_si128(x2, x5);
            x3 = _mm_shuffle_epi32(x3, 0x93);
            x4 = _mm_add_epi32(x4, x2);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 13);
            x5 = _mm_srli_epi32(x5, 19);
            x1 = _mm_xor_si128(x1, x4);
            x4 = x2;
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_shuffle_epi32(x2, 0x4e);
            x4 = _mm_add_epi32(x4, x1);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 18);
            x5 = _mm_srli_epi32(x5, 14);
            x0 = _mm_xor_si128(x0, x4);
            x4 = x3;
            x0 = _mm_xor_si128(x0, x5);
            x1 = _mm_shuffle_epi32(x1, 0x39);
            x4 = _mm_add_epi32(x4, x0);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 7);
            x5 = _mm_srli_epi32(x5, 25);
            x1 = _mm_xor_si128(x1, x4);
            x4 = x0;
            x1 = _mm_xor_si128(x1, x5);
            x4 = _mm_add_epi32(x4, x1);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 9);
            x5 = _mm_srli_epi32(x5, 23);
            x2 = _mm_xor_si128(x2, x4);
            x4 = x1;
            x2 = _mm_xor_si128(x2, x5);
            x1 = _mm_shuffle_epi32(x1, 0x93);
            x4 = _mm_add_epi32(x4, x2);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 13);
            x5 = _mm_srli_epi32(x5, 19);
            x3 = _mm_xor_si128(x3, x4);
            x4 = x2;
            x3 = _mm_xor_si128(x3, x5);
            x2 = _mm_shuffle_epi32(x2, 0x4e);
            x4 = _mm_add_epi32(x4, x3);
            x5 = x4;
            x4 = _mm_slli_epi32(x4, 18);
            x5 = _mm_srli_epi32(x5, 14);
            x0 = _mm_xor_si128(x0, x4);
            x3 = _mm_shuffle_epi32(x3, 0x39);
            x0 = _mm_xor_si128(x0, x5);
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

#endif

#if defined(SCRYPT_SALSA_AVX)
    /* uses salsa_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa/8-AVX"
    #undef SCRYPT_SALSA_INCLUDED
    #define SCRYPT_SALSA_INCLUDED
#endif