Path: blob/linux/scryptjane/scrypt-jane-mix_chacha-avx.h
/* x86 */
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
    a1(push ebx)
    a1(push edi)
    a1(push esi)
    a1(push ebp)
    a2(mov ebp,esp)
    a2(mov edi,[ebp+20])
    a2(mov esi,[ebp+24])
    a2(mov eax,[ebp+28])
    a2(mov ebx,[ebp+32])
    a2(sub esp,64)
    a2(and esp,~63)
    a2(lea edx,[ebx*2])
    a2(shl edx,6)
    a2(lea ecx,[edx-64])
    a2(and eax, eax)
    a2(mov ebx, 0x01000302)
    a2(vmovd xmm4, ebx)
    a2(mov ebx, 0x05040706)
    a2(vmovd xmm0, ebx)
    a2(mov ebx, 0x09080b0a)
    a2(vmovd xmm1, ebx)
    a2(mov ebx, 0x0d0c0f0e)
    a2(vmovd xmm2, ebx)
    a2(mov ebx, 0x02010003)
    a2(vmovd xmm5, ebx)
    a2(mov ebx, 0x06050407)
    a2(vmovd xmm3, ebx)
    a2(mov ebx, 0x0a09080b)
    a2(vmovd xmm6, ebx)
    a2(mov ebx, 0x0e0d0c0f)
    a2(vmovd xmm7, ebx)
    a3(vpunpckldq xmm4, xmm4, xmm0)
    a3(vpunpckldq xmm5, xmm5, xmm3)
    a3(vpunpckldq xmm1, xmm1, xmm2)
    a3(vpunpckldq xmm6, xmm6, xmm7)
    a3(vpunpcklqdq xmm4, xmm4, xmm1)
    a3(vpunpcklqdq xmm5, xmm5, xmm6)
    a2(vmovdqa xmm0,[ecx+esi+0])
    a2(vmovdqa xmm1,[ecx+esi+16])
    a2(vmovdqa xmm2,[ecx+esi+32])
    a2(vmovdqa xmm3,[ecx+esi+48])
    a1(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[ecx+eax+0])
    a3(vpxor xmm1,xmm1,[ecx+eax+16])
    a3(vpxor xmm2,xmm2,[ecx+eax+32])
    a3(vpxor xmm3,xmm3,[ecx+eax+48])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor ecx,ecx)
    a2(xor ebx,ebx)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and eax, eax)
    a3(vpxor xmm0,xmm0,[esi+ecx+0])
    a3(vpxor xmm1,xmm1,[esi+ecx+16])
    a3(vpxor xmm2,xmm2,[esi+ecx+32])
    a3(vpxor xmm3,xmm3,[esi+ecx+48])
    a1(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[eax+ecx+0])
    a3(vpxor xmm1,xmm1,[eax+ecx+16])
    a3(vpxor xmm2,xmm2,[eax+ecx+32])
    a3(vpxor xmm3,xmm3,[eax+ecx+48])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa [esp+0],xmm0)
    a2(vmovdqa [esp+16],xmm1)
    a2(vmovdqa [esp+32],xmm2)
    a2(vmovdqa [esp+48],xmm3)
    a2(mov eax,8)
    a1(scrypt_chacha_avx_loop:)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm4)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpsrld xmm6,xmm1,20)
    a3(vpslld xmm1,xmm1,12)
    a3(vpxor xmm1,xmm1,xmm6)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm5)
    a3(vpshufd xmm0,xmm0,0x93)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpshufd xmm3,xmm3,0x4e)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpshufd xmm2,xmm2,0x39)
    a3(vpsrld xmm6,xmm1,25)
    a3(vpslld xmm1,xmm1,7)
    a3(vpxor xmm1,xmm1,xmm6)
    a2(sub eax,2)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm4)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpsrld xmm6,xmm1,20)
    a3(vpslld xmm1,xmm1,12)
    a3(vpxor xmm1,xmm1,xmm6)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm5)
    a3(vpshufd xmm0,xmm0,0x39)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpshufd xmm3,xmm3,0x4e)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpshufd xmm2,xmm2,0x93)
    a3(vpsrld xmm6,xmm1,25)
    a3(vpslld xmm1,xmm1,7)
    a3(vpxor xmm1,xmm1,xmm6)
    a1(ja scrypt_chacha_avx_loop)
    a3(vpaddd xmm0,xmm0,[esp+0])
    a3(vpaddd xmm1,xmm1,[esp+16])
    a3(vpaddd xmm2,xmm2,[esp+32])
    a3(vpaddd xmm3,xmm3,[esp+48])
    a2(lea eax,[ebx+ecx])
    a2(xor ebx,edx)
    a2(and eax,~0x7f)
    a2(add ecx,64)
    a2(shr eax,1)
    a2(add eax, edi)
    a2(cmp ecx,edx)
    a2(vmovdqa [eax+0],xmm0)
    a2(vmovdqa [eax+16],xmm1)
    a2(vmovdqa [eax+32],xmm2)
    a2(vmovdqa [eax+48],xmm3)
    a2(mov eax,[ebp+28])
    a1(jne scrypt_ChunkMix_avx_loop)
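
/*
 * Implementation note: xmm4 and xmm5 built above (and again in the x64 and
 * intrinsic versions below) are pshufb byte-shuffle masks, not data. ChaCha's
 * rotations by 16 and 8 bits are byte-aligned, so each is a single byte
 * permutation per 32-bit lane:
 *
 *     rotl32(x, 16): dest bytes take source bytes [2,3,0,1]
 *         -> mask dwords 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e
 *     rotl32(x, 8):  dest bytes take source bytes [3,0,1,2]
 *         -> mask dwords 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
 *
 * The non-byte-aligned rotations by 12 and 7 use a shift pair instead
 * (vpsrld/vpslld); combining the halves with vpxor is equivalent to vpor
 * here because their set bits cannot overlap.
 */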
    a2(mov esp,ebp)
    a1(pop ebp)
    a1(pop esi)
    a1(pop edi)
    a1(pop ebx)
    aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
    a2(lea rcx,[rcx*2])
    a2(shl rcx,6)
    a2(lea r9,[rcx-64])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(vmovdqa xmm0,[rax+0])
    a2(vmovdqa xmm1,[rax+16])
    a2(vmovdqa xmm2,[rax+32])
    a2(vmovdqa xmm3,[rax+48])
    a2(mov r8, 0x0504070601000302)
    a2(mov rax, 0x0d0c0f0e09080b0a)
    a2(movq xmm4, r8)
    a2(movq xmm6, rax)
    a2(mov r8, 0x0605040702010003)
    a2(mov rax, 0x0e0d0c0f0a09080b)
    a2(movq xmm5, r8)
    a2(movq xmm7, rax)
    a3(vpunpcklqdq xmm4, xmm4, xmm6)
    a3(vpunpcklqdq xmm5, xmm5, xmm7)
    a1(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[r9+0])
    a3(vpxor xmm1,xmm1,[r9+16])
    a3(vpxor xmm2,xmm2,[r9+32])
    a3(vpxor xmm3,xmm3,[r9+48])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor r8,r8)
    a2(xor r9,r9)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and rdx, rdx)
    a3(vpxor xmm0,xmm0,[rsi+r9+0])
    a3(vpxor xmm1,xmm1,[rsi+r9+16])
    a3(vpxor xmm2,xmm2,[rsi+r9+32])
    a3(vpxor xmm3,xmm3,[rsi+r9+48])
    a1(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[rdx+r9+0])
    a3(vpxor xmm1,xmm1,[rdx+r9+16])
    a3(vpxor xmm2,xmm2,[rdx+r9+32])
    a3(vpxor xmm3,xmm3,[rdx+r9+48])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa xmm8,xmm0)
    a2(vmovdqa xmm9,xmm1)
    a2(vmovdqa xmm10,xmm2)
    a2(vmovdqa xmm11,xmm3)
    a2(mov rax,8)
    a1(scrypt_chacha_avx_loop:)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm4)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpsrld xmm12,xmm1,20)
    a3(vpslld xmm1,xmm1,12)
    a3(vpxor xmm1,xmm1,xmm12)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm5)
    a3(vpshufd xmm0,xmm0,0x93)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpshufd xmm3,xmm3,0x4e)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpshufd xmm2,xmm2,0x39)
    a3(vpsrld xmm12,xmm1,25)
    a3(vpslld xmm1,xmm1,7)
    a3(vpxor xmm1,xmm1,xmm12)
    a2(sub rax,2)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm4)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpsrld xmm12,xmm1,20)
    a3(vpslld xmm1,xmm1,12)
    a3(vpxor xmm1,xmm1,xmm12)
    a3(vpaddd xmm0,xmm0,xmm1)
    a3(vpxor xmm3,xmm3,xmm0)
    a3(vpshufb xmm3,xmm3,xmm5)
    a3(vpshufd xmm0,xmm0,0x39)
    a3(vpaddd xmm2,xmm2,xmm3)
    a3(vpshufd xmm3,xmm3,0x4e)
    a3(vpxor xmm1,xmm1,xmm2)
    a3(vpshufd xmm2,xmm2,0x93)
    a3(vpsrld xmm12,xmm1,25)
    a3(vpslld xmm1,xmm1,7)
    a3(vpxor xmm1,xmm1,xmm12)
    a1(ja scrypt_chacha_avx_loop)
    a3(vpaddd xmm0,xmm0,xmm8)
    a3(vpaddd xmm1,xmm1,xmm9)
    a3(vpaddd xmm2,xmm2,xmm10)
    a3(vpaddd xmm3,xmm3,xmm11)
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0x7f)
    a2(add r9,64)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(vmovdqa [rax+0],xmm0)
    a2(vmovdqa [rax+16],xmm1)
    a2(vmovdqa [rax+32],xmm2)
    a2(vmovdqa [rax+48],xmm3)
    a1(jne scrypt_ChunkMix_avx_loop)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))

#define SCRYPT_CHACHA_AVX
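
/*
 * This is scrypt's BlockMix over a chunk of 2r 64-byte blocks, with ChaCha/8
 * as the mixing function H; the numbered comments in the function bodies
 * below follow the steps of BlockMix in the scrypt paper:
 *
 *     X = B[2r-1]
 *     for i = 0 .. 2r-1:  X = H(X ^ B[i]);  Y[i] = X
 *     B' = (Y[0], Y[2], ..., Y[2r-2], Y[1], Y[3], ..., Y[2r-1])
 *
 * 'half' toggles between 0 and r on each iteration, so (i / 2) + half sends
 * even-indexed outputs to the first half of Bout and odd-indexed outputs to
 * the second half. A non-NULL Bxor additionally XORs each input block with
 * the matching block of Bxor, letting the caller fold in V[j] during the
 * second loop of ROMix.
 */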
static void NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp, x0, x1, x2, x3, x6, t0, t1, t2, t3;
    const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}
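
/*
 * For reference, each pass of the rounds loop above is one ChaCha double
 * round. Viewing the state as four rows a,b,c,d (x0..x3), the vector code
 * runs this quarter round on all four columns at once:
 *
 *     a += b; d ^= a; d = rotl32(d, 16);
 *     c += d; b ^= c; b = rotl32(b, 12);
 *     a += b; d ^= a; d = rotl32(d, 8);
 *     c += d; b ^= c; b = rotl32(b, 7);
 *
 * The _mm_shuffle_epi32 calls with 0x93, 0x4e and 0x39 rotate a row's four
 * lanes by one, two or three positions, lining the diagonals up as columns
 * so the same code then performs the diagonal round; the mirrored shuffles
 * in the second half-round undo the rotation.
 */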

/*
 * Special version with r = 1 and no XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
    const uint32_t r = 1;
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp, x0, x1, x2, x3, x6, t0, t1, t2, t3;
    const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}
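
/*
 * Sketch of a hypothetical call site (illustrative only; the real dispatch
 * lives in the scrypt-jane core, not in this header). The specializations
 * above and below let the common r == 1 case skip the Bxor test and the
 * general chunk bookkeeping:
 *
 *     if (r == 1) {
 *         if (Bxor)
 *             scrypt_ChunkMix_avx_1_xor(Bout, Bin, Bxor);
 *         else
 *             scrypt_ChunkMix_avx_1(Bout, Bin);
 *     } else {
 *         scrypt_ChunkMix_avx(Bout, Bin, Bxor, r);
 *     }
 */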

/*
 * Special version with r = 1 and unconditional XORing
 * - mikaelh
 */
static void NOINLINE
scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
    const uint32_t r = 1;
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp, x0, x1, x2, x3, x6, t0, t1, t2, t3;
    const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];

    xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
    x0 = _mm_xor_si128(x0, xmmp[0]);
    x1 = _mm_xor_si128(x1, xmmp[1]);
    x2 = _mm_xor_si128(x2, xmmp[2]);
    x3 = _mm_xor_si128(x3, xmmp[3]);

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        xmmp = (xmmi *)scrypt_block(Bxor, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;

        for (rounds = 8; rounds; rounds -= 2) {
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x93);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x39);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x4);
            x2 = _mm_add_epi32(x2, x3);
            x1 = _mm_xor_si128(x1, x2);
            x6 = _mm_srli_epi32(x1, 20);
            x1 = _mm_slli_epi32(x1, 12);
            x1 = _mm_or_si128(x1, x6);
            x0 = _mm_add_epi32(x0, x1);
            x3 = _mm_xor_si128(x3, x0);
            x3 = _mm_shuffle_epi8(x3, x5);
            x0 = _mm_shuffle_epi32(x0, 0x39);
            x2 = _mm_add_epi32(x2, x3);
            x3 = _mm_shuffle_epi32(x3, 0x4e);
            x1 = _mm_xor_si128(x1, x2);
            x2 = _mm_shuffle_epi32(x2, 0x93);
            x6 = _mm_srli_epi32(x1, 25);
            x1 = _mm_slli_epi32(x1, 7);
            x1 = _mm_or_si128(x1, x6);
        }

        x0 = _mm_add_epi32(x0, t0);
        x1 = _mm_add_epi32(x1, t1);
        x2 = _mm_add_epi32(x2, t2);
        x3 = _mm_add_epi32(x3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
    }
}

#endif

#if defined(SCRYPT_CHACHA_AVX)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "ChaCha/8-AVX"
    #undef SCRYPT_CHACHA_INCLUDED
    #define SCRYPT_CHACHA_INCLUDED
#endif
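
/*
 * Selection mechanics, as far as can be inferred from the guards in this
 * file: defining SCRYPT_CHACHA_INCLUDED marks that a ChaCha mix
 * implementation has been compiled in. When SCRYPT_CHOOSE_COMPILETIME is
 * set, the #if at the top of each block then rejects every implementation
 * after the first one accepted; without it, all supported variants are
 * built and one is picked at runtime. SCRYPT_MIX is the human-readable
 * name reported for the selected mix.
 */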