GitHub Repository: tpruvot/cpuminer-multi
Path: blob/linux/scryptjane/scrypt-jane-mix_chacha-avx.h
/* x86 */
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	/* load arguments: edi = Bout, esi = Bin, eax = Bxor, ebx = r */
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	/* 64-byte aligned scratch block on the stack */
	a2(sub esp,64)
	a2(and esp,~63)
	/* edx = chunk size in bytes (r * 2 * 64), ecx = offset of the last block */
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	/* ZF = (Bxor == NULL) */
	a2(and eax, eax)
	/* build the pshufb masks: xmm4 rotates each 32-bit lane left by 16, xmm5 by 8 */
	a2(mov ebx, 0x01000302)
	a2(vmovd xmm4, ebx)
	a2(mov ebx, 0x05040706)
	a2(vmovd xmm0, ebx)
	a2(mov ebx, 0x09080b0a)
	a2(vmovd xmm1, ebx)
	a2(mov ebx, 0x0d0c0f0e)
	a2(vmovd xmm2, ebx)
	a2(mov ebx, 0x02010003)
	a2(vmovd xmm5, ebx)
	a2(mov ebx, 0x06050407)
	a2(vmovd xmm3, ebx)
	a2(mov ebx, 0x0a09080b)
	a2(vmovd xmm6, ebx)
	a2(mov ebx, 0x0e0d0c0f)
	a2(vmovd xmm7, ebx)
	a3(vpunpckldq xmm4, xmm4, xmm0)
	a3(vpunpckldq xmm5, xmm5, xmm3)
	a3(vpunpckldq xmm1, xmm1, xmm2)
	a3(vpunpckldq xmm6, xmm6, xmm7)
	a3(vpunpcklqdq xmm4, xmm4, xmm1)
	a3(vpunpcklqdq xmm5, xmm5, xmm6)
	/* X = last block of Bin, optionally XORed with the last block of Bxor */
	a2(vmovdqa xmm0,[ecx+esi+0])
	a2(vmovdqa xmm1,[ecx+esi+16])
	a2(vmovdqa xmm2,[ecx+esi+32])
	a2(vmovdqa xmm3,[ecx+esi+48])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[ecx+eax+0])
	a3(vpxor xmm1,xmm1,[ecx+eax+16])
	a3(vpxor xmm2,xmm2,[ecx+eax+32])
	a3(vpxor xmm3,xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	/* for each 64-byte block: X = ChaCha/8(X ^ Bin_i [^ Bxor_i]) */
	a1(scrypt_ChunkMix_avx_loop:)
	a2(and eax, eax)
	a3(vpxor xmm0,xmm0,[esi+ecx+0])
	a3(vpxor xmm1,xmm1,[esi+ecx+16])
	a3(vpxor xmm2,xmm2,[esi+ecx+32])
	a3(vpxor xmm3,xmm3,[esi+ecx+48])
	a1(jz scrypt_ChunkMix_avx_no_xor2)
	a3(vpxor xmm0,xmm0,[eax+ecx+0])
	a3(vpxor xmm1,xmm1,[eax+ecx+16])
	a3(vpxor xmm2,xmm2,[eax+ecx+32])
	a3(vpxor xmm3,xmm3,[eax+ecx+48])
	a1(scrypt_ChunkMix_avx_no_xor2:)
	/* save the block input for the feed-forward addition after the rounds */
	a2(vmovdqa [esp+0],xmm0)
	a2(vmovdqa [esp+16],xmm1)
	a2(vmovdqa [esp+32],xmm2)
	a2(vmovdqa [esp+48],xmm3)
	/* 8 ChaCha rounds; the loop body below does two rounds per iteration */
	a2(mov eax,8)
	a1(scrypt_chacha_avx_loop: )
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm6,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm6)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x93)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x39)
	a3(vpsrld xmm6,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm6)
	a2(sub eax,2)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm6,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm6)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x39)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x93)
	a3(vpsrld xmm6,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm6)
	a1(ja scrypt_chacha_avx_loop)
	/* feed-forward: add the saved block input back in */
	a3(vpaddd xmm0,xmm0,[esp+0])
	a3(vpaddd xmm1,xmm1,[esp+16])
	a3(vpaddd xmm2,xmm2,[esp+32])
	a3(vpaddd xmm3,xmm3,[esp+48])
	/* interleave the output: even blocks go to the first half of Bout, odd blocks to the second half */
	a2(lea eax,[ebx+ecx])
	a2(xor ebx,edx)
	a2(and eax,~0x7f)
	a2(add ecx,64)
	a2(shr eax,1)
	a2(add eax, edi)
	a2(cmp ecx,edx)
	a2(vmovdqa [eax+0],xmm0)
	a2(vmovdqa [eax+16],xmm1)
	a2(vmovdqa [eax+32],xmm2)
	a2(vmovdqa [eax+48],xmm3)
	a2(mov eax,[ebp+28])
	a1(jne scrypt_ChunkMix_avx_loop)
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif

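/*
 * Editor's note (illustrative, not part of the original source): the
 * constants combined into xmm4/xmm5 above are pshufb control masks that
 * rotate every 32-bit lane left by 16 and by 8 bits as a byte shuffle,
 * which is how ChaCha's 16- and 8-bit rotations are done without a
 * shift/or pair. The sketch below shows the same masks with intrinsics;
 * the guard macro and function names are hypothetical and never defined
 * by this project, and <immintrin.h> is assumed to be available.
 */
#if defined(SCRYPT_CHACHA_AVX_ROTL_EXAMPLE)
#include <immintrin.h>

/* rotate each 32-bit lane left by 16: bytes within a lane become 2,3,0,1 */
static __m128i example_rotl16_32(__m128i x) {
	const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
	return _mm_shuffle_epi8(x, mask);
}

/* rotate each 32-bit lane left by 8: bytes within a lane become 3,0,1,2 */
static __m128i example_rotl8_32(__m128i x) {
	const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
	return _mm_shuffle_epi8(x, mask);
}
#endif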


/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* registers as used below: rdi = Bout, rsi = Bin, rdx = Bxor, rcx = r */
	/* rcx = chunk size in bytes (r * 2 * 64), r9 = offset of the last block */
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* ZF = (Bxor == NULL) */
	a2(and rdx, rdx)
	/* X = last block of Bin, optionally XORed with the last block of Bxor */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	/* pshufb masks: xmm4 rotates each 32-bit lane left by 16, xmm5 by 8 */
	a2(mov r8, 0x0504070601000302)
	a2(mov rax, 0x0d0c0f0e09080b0a)
	a2(movq xmm4, r8)
	a2(movq xmm6, rax)
	a2(mov r8, 0x0605040702010003)
	a2(mov rax, 0x0e0d0c0f0a09080b)
	a2(movq xmm5, r8)
	a2(movq xmm7, rax)
	a3(vpunpcklqdq xmm4, xmm4, xmm6)
	a3(vpunpcklqdq xmm5, xmm5, xmm7)
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r8,r8)
	a2(xor r9,r9)
	/* for each 64-byte block: X = ChaCha/8(X ^ Bin_i [^ Bxor_i]) */
	a1(scrypt_ChunkMix_avx_loop:)
	a2(and rdx, rdx)
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	a1(jz scrypt_ChunkMix_avx_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a1(scrypt_ChunkMix_avx_no_xor2:)
	/* save the block input in xmm8-xmm11 for the feed-forward addition */
	a2(vmovdqa xmm8,xmm0)
	a2(vmovdqa xmm9,xmm1)
	a2(vmovdqa xmm10,xmm2)
	a2(vmovdqa xmm11,xmm3)
	/* 8 ChaCha rounds; the loop body below does two rounds per iteration */
	a2(mov rax,8)
	a1(scrypt_chacha_avx_loop: )
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm12,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm12)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x93)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x39)
	a3(vpsrld xmm12,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm12)
	a2(sub rax,2)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm4)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpsrld xmm12,xmm1,20)
	a3(vpslld xmm1,xmm1,12)
	a3(vpxor xmm1,xmm1,xmm12)
	a3(vpaddd xmm0,xmm0,xmm1)
	a3(vpxor xmm3,xmm3,xmm0)
	a3(vpshufb xmm3,xmm3,xmm5)
	a3(vpshufd xmm0,xmm0,0x39)
	a3(vpaddd xmm2,xmm2,xmm3)
	a3(vpshufd xmm3,xmm3,0x4e)
	a3(vpxor xmm1,xmm1,xmm2)
	a3(vpshufd xmm2,xmm2,0x93)
	a3(vpsrld xmm12,xmm1,25)
	a3(vpslld xmm1,xmm1,7)
	a3(vpxor xmm1,xmm1,xmm12)
	a1(ja scrypt_chacha_avx_loop)
	a3(vpaddd xmm0,xmm0,xmm8)
	a3(vpaddd xmm1,xmm1,xmm9)
	a3(vpaddd xmm2,xmm2,xmm10)
	a3(vpaddd xmm3,xmm3,xmm11)
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0x7f)
	a2(add r9,64)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	a1(jne scrypt_ChunkMix_avx_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

static void NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

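/*
 * Editor's note (illustrative, not part of the original source): ChunkMix
 * is scrypt's BlockMix step. A ROMix-style caller fills a lookup table V
 * and then revisits it, passing Bxor = NULL on the first pass and V_j on
 * the second. Everything named example_* below, and the simplified buffer
 * handling, is hypothetical; only scrypt_ChunkMix_avx and the chunk layout
 * (2*r blocks of 64 bytes) come from this file. The guard macro is never
 * defined by the build, and N must be a power of two.
 */
#if defined(SCRYPT_CHACHA_AVX_ROMIX_EXAMPLE)
#include <string.h>

static void
example_romix(uint32_t *X, uint32_t *Y, uint32_t *V, uint32_t N, uint32_t r) {
	uint32_t i, j, chunkWords = r * 2 * 16;   /* one chunk = 2*r blocks of 16 words */

	/* 1st loop: V_i = X; X = BlockMix(X) */
	for (i = 0; i < N; i++) {
		memcpy(V + i * chunkWords, X, chunkWords * sizeof(uint32_t));
		scrypt_ChunkMix_avx(Y, X, NULL, r);
		memcpy(X, Y, chunkWords * sizeof(uint32_t));
	}

	/* 2nd loop: j = Integerify(X) mod N; X = BlockMix(X ^ V_j) */
	for (i = 0; i < N; i++) {
		j = X[chunkWords - 16] & (N - 1);     /* first word of the last 64-byte block */
		scrypt_ChunkMix_avx(Y, X, V + j * chunkWords, r);
		memcpy(X, Y, chunkWords * sizeof(uint32_t));
	}
}
#endif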

/*
 * Special version with r = 1 and no XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

/*
 * Special version with r = 1 and unconditional XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
	const uint32_t r = 1;
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
	const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
	x0 = _mm_xor_si128(x0, xmmp[0]);
	x1 = _mm_xor_si128(x1, xmmp[1]);
	x2 = _mm_xor_si128(x2, xmmp[2]);
	x3 = _mm_xor_si128(x3, xmmp[3]);

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		xmmp = (xmmi *)scrypt_block(Bxor, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x93);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x39);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x4);
			x2 = _mm_add_epi32(x2, x3);
			x1 = _mm_xor_si128(x1, x2);
			x6 = _mm_srli_epi32(x1, 20);
			x1 = _mm_slli_epi32(x1, 12);
			x1 = _mm_or_si128(x1, x6);
			x0 = _mm_add_epi32(x0, x1);
			x3 = _mm_xor_si128(x3, x0);
			x3 = _mm_shuffle_epi8(x3, x5);
			x0 = _mm_shuffle_epi32(x0, 0x39);
			x2 = _mm_add_epi32(x2, x3);
			x3 = _mm_shuffle_epi32(x3, 0x4e);
			x1 = _mm_xor_si128(x1, x2);
			x2 = _mm_shuffle_epi32(x2, 0x93);
			x6 = _mm_srli_epi32(x1, 25);
			x1 = _mm_slli_epi32(x1, 7);
			x1 = _mm_or_si128(x1, x6);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}
}

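/*
 * Editor's note (illustrative, not part of the original source): the two
 * r = 1 specializations above map onto the two halves of ROMix. The first
 * half never XORs a second chunk, so it can call scrypt_ChunkMix_avx_1;
 * the second half always XORs V_j, so it can call scrypt_ChunkMix_avx_1_xor.
 * example_romix_1 and its buffer handling are hypothetical, the guard macro
 * is never defined by the build, and N must be a power of two.
 */
#if defined(SCRYPT_CHACHA_AVX_R1_EXAMPLE)
#include <string.h>

static void
example_romix_1(uint32_t *X /*[32]*/, uint32_t *Y /*[32]*/, uint32_t *V /*[N*32]*/, uint32_t N) {
	uint32_t i, j;

	/* 1st loop: V_i = X; X = BlockMix(X); no second input, so no XOR */
	for (i = 0; i < N; i++) {
		memcpy(V + i * 32, X, 32 * sizeof(uint32_t));
		scrypt_ChunkMix_avx_1(Y, X);
		memcpy(X, Y, 32 * sizeof(uint32_t));
	}

	/* 2nd loop: X = BlockMix(X ^ V_j), with j = Integerify(X) mod N */
	for (i = 0; i < N; i++) {
		j = X[16] & (N - 1);               /* first word of the last block */
		scrypt_ChunkMix_avx_1_xor(Y, X, V + j * 32);
		memcpy(X, Y, 32 * sizeof(uint32_t));
	}
}
#endif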

#endif

#if defined(SCRYPT_CHACHA_AVX)
	#undef SCRYPT_MIX
	#define SCRYPT_MIX "ChaCha/8-AVX"
	#undef SCRYPT_CHACHA_INCLUDED
	#define SCRYPT_CHACHA_INCLUDED
#endif