Path: blob/linux/scryptjane/scrypt-jane-mix_salsa64-sse2.h
/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[rcx*2])
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a2(movdqa xmm4,[rax+64])
    a2(movdqa xmm5,[rax+80])
    a2(movdqa xmm6,[rax+96])
    a2(movdqa xmm7,[rax+112])
    a1(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a2(pxor xmm4,[r9+64])
    a2(pxor xmm5,[r9+80])
    a2(pxor xmm6,[r9+96])
    a2(pxor xmm7,[r9+112])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_sse2_loop:)
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a2(pxor xmm4,[rsi+r9+64])
    a2(pxor xmm5,[rsi+r9+80])
    a2(pxor xmm6,[rsi+r9+96])
    a2(pxor xmm7,[rsi+r9+112])
    a1(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a2(pxor xmm4,[rdx+r9+64])
    a2(pxor xmm5,[rdx+r9+80])
    a2(pxor xmm6,[rdx+r9+96])
    a2(pxor xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    a2(movdqa [rsp+0],xmm0)
    a2(movdqa [rsp+16],xmm1)
    a2(movdqa [rsp+32],xmm2)
    a2(movdqa [rsp+48],xmm3)
    a2(movdqa [rsp+64],xmm4)
    a2(movdqa [rsp+80],xmm5)
    a2(movdqa [rsp+96],xmm6)
    a2(movdqa [rsp+112],xmm7)
    a2(mov rax,8)
    a1(scrypt_salsa64_sse2_loop: )
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm4, xmm10)
    a2(pxor xmm5, xmm11)
    a2(pxor xmm4, xmm8)
    a2(pxor xmm5, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm4)
    a2(paddq xmm11, xmm5)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm4)
    a2(movdqa xmm9, xmm5)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm8, xmm2)
    a2(movdqa xmm9, xmm3)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(movdqa xmm2, xmm7)
    a2(movdqa xmm3, xmm6)
    a2(punpcklqdq xmm10, xmm6)
    a2(punpcklqdq xmm11, xmm7)
    a2(movdqa xmm6, xmm8)
    a2(movdqa xmm7, xmm9)
    a2(punpcklqdq xmm9, xmm9)
    a2(punpcklqdq xmm8, xmm8)
    a2(punpckhqdq xmm2, xmm10)
    a2(punpckhqdq xmm3, xmm11)
    a2(punpckhqdq xmm6, xmm9)
    a2(punpckhqdq xmm7, xmm8)
    a2(sub rax, 2)
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm5, xmm10)
    a2(pxor xmm4, xmm11)
    a2(pxor xmm5, xmm8)
    a2(pxor xmm4, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm5)
    a2(paddq xmm11, xmm4)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm5)
    a2(movdqa xmm9, xmm4)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm8, xmm2)
    a2(movdqa xmm9, xmm3)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(movdqa xmm2, xmm7)
    a2(movdqa xmm3, xmm6)
    a2(punpcklqdq xmm10, xmm6)
    a2(punpcklqdq xmm11, xmm7)
    a2(movdqa xmm6, xmm8)
    a2(movdqa xmm7, xmm9)
    a2(punpcklqdq xmm9, xmm9)
    a2(punpcklqdq xmm8, xmm8)
    a2(punpckhqdq xmm2, xmm10)
    a2(punpckhqdq xmm3, xmm11)
    a2(punpckhqdq xmm6, xmm9)
    a2(punpckhqdq xmm7, xmm8)
    a1(ja scrypt_salsa64_sse2_loop)
    a2(paddq xmm0,[rsp+0])
    a2(paddq xmm1,[rsp+16])
    a2(paddq xmm2,[rsp+32])
    a2(paddq xmm3,[rsp+48])
    a2(paddq xmm4,[rsp+64])
    a2(paddq xmm5,[rsp+80])
    a2(paddq xmm6,[rsp+96])
    a2(paddq xmm7,[rsp+112])
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a2(movdqa [rax+64],xmm4)
    a2(movdqa [rax+80],xmm5)
    a2(movdqa [rax+96],xmm6)
    a2(movdqa [rax+112],xmm7)
    a1(jne scrypt_ChunkMix_sse2_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)

#define SCRYPT_SALSA64_SSE2

static void asm_calling_convention
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x4;
            z1 = x5;
            z2 = x2;
            z3 = x3;
            x4 = z1;
            x5 = z0;
            x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
            x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
            x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
            x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x4;
            z1 = x5;
            z2 = x2;
            z3 = x3;
            x4 = z1;
            x5 = z0;
            x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
            x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
            x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
            x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_SSE2)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSE2"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif

/* sse3/avx use this as well */
#if defined(SCRYPT_SALSA64_INCLUDED)
    /*
        Default layout:
         0  1  2  3
         4  5  6  7
         8  9 10 11
        12 13 14 15

        SSE2 layout:
         0  5 10 15
        12  1  6 11
         8 13  2  7
         4  9 14  3
    */


    static void asm_calling_convention
    salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
        uint64_t t;
        while (count--) {
            t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
            t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
            t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
            t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
            t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
            t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
            blocks += 16;
        }
    }
#endif
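/*
    Reference sketch (not part of the original scrypt-jane source): SSE2 has no
    64-bit rotate instruction, so both paths above build each rotation from a
    shift pair (psllq/psrlq, or _mm_slli_epi64/_mm_srli_epi64) that is xored
    into the target lane, and use pshufd with _MM_SHUFFLE(2,3,0,1) (imm 0xb1)
    for the rotate-by-32 steps, which simply swaps the two dwords of every
    qword. The hypothetical helpers below show the scalar equivalent of one
    such mixing step; the names are illustrative only and the block is
    compiled out.
*/
#if 0
#include <stdint.h>

/* rotate left by n (0 < n < 64): the same value the paired shifts produce
   per 64-bit lane before being xored into the destination */
static uint64_t
salsa64_rotl64_sketch(uint64_t x, unsigned n) {
    return (x << n) ^ (x >> (64 - n));
}

/* one mixing step as the vectorized round computes it, e.g.
   x4 ^= rotl64(x0 + x6, 13); the remaining steps rotate by 32, 39 and 32 */
static void
salsa64_mix_step_sketch(uint64_t *x4, uint64_t x0, uint64_t x6) {
    *x4 ^= salsa64_rotl64_sketch(x0 + x6, 13);
}
#endif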