Path: blob/a-new-beginning/SharedDependencies/Sources/cryptopp/donna_sse.cpp
2 views
// donna_sse.cpp - written and placed in public domain by Jeffrey Walton1// This is a integration of Andrew Moon's public domain code.2// Also see https://github.com/floodyberry/curve25519-donna.34// This is a integration of Andrew Moon's public domain code. The port was5// clean, but it has one potential problem. The original code is C and relies6// upon unions. Accessing the inactive union member is undefined behavior in7// C++. That means copying the array into packedelem8.u is OK; but then using8// packedelem8.v in a calculation is UB. Fortunately most (all?) compilers9// take pity on C++ developers and compile the code. We will have to keep an10// eye on things or rewrite significant portions of this code.1112// If needed, see Moon's commit "Go back to ignoring 256th bit [sic]",13// https://github.com/floodyberry/curve25519-donna/commit/57a683d18721a6581415#include "pch.h"1617#include "config.h"18#include "donna.h"19#include "secblock.h"20#include "misc.h"2122// The data is aligned, but Clang issues warning based on type23// and not the actual alignment of the variable and data.24#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE25# pragma GCC diagnostic ignored "-Wcast-align"26# pragma GCC diagnostic ignored "-Wunused-function"27#endif2829#if CRYPTOPP_MSC_VERSION30# pragma warning(disable: 4244)31#endif3233// Squash MS LNK4221 and libtool warnings34extern const char DONNA_SSE_FNAME[] = __FILE__;3536#if (CRYPTOPP_CURVE25519_SSE2)3738#include "donna_sse.h"3940ANONYMOUS_NAMESPACE_BEGIN4142using CryptoPP::byte;43using CryptoPP::word32;44using CryptoPP::sword32;45using CryptoPP::word64;46using CryptoPP::sword64;47using CryptoPP::GetBlock;48using CryptoPP::LittleEndian;4950// Bring in all the symbols from the SSE header51using namespace CryptoPP::Donna::ArchSSE;5253/* Copy a bignum to another: out = in */54inline void55curve25519_copy(bignum25519 out, const bignum25519 in) {56xmmi x0,x1,x2;57x0 = _mm_load_si128((xmmi*)in + 0);58x1 = _mm_load_si128((xmmi*)in + 1);59x2 = _mm_load_si128((xmmi*)in + 2);60_mm_store_si128((xmmi*)out + 0, x0);61_mm_store_si128((xmmi*)out + 1, x1);62_mm_store_si128((xmmi*)out + 2, x2);63}6465/* Take a little-endian, 32-byte number and expand it into polynomial form */66inline void67curve25519_expand(bignum25519 out, const byte in[32]) {68word32 x0,x1,x2,x3,x4,x5,x6,x7;6970x0 = *(word32 *)(in + 0);71x1 = *(word32 *)(in + 4);72x2 = *(word32 *)(in + 8);73x3 = *(word32 *)(in + 12);74x4 = *(word32 *)(in + 16);75x5 = *(word32 *)(in + 20);76x6 = *(word32 *)(in + 24);77x7 = *(word32 *)(in + 28);7879out[0] = ( x0 ) & reduce_mask_26;80out[1] = ((((word64)x1 << 32) | x0) >> 26) & reduce_mask_25;81out[2] = ((((word64)x2 << 32) | x1) >> 19) & reduce_mask_26;82out[3] = ((((word64)x3 << 32) | x2) >> 13) & reduce_mask_25;83out[4] = (( x3) >> 6) & reduce_mask_26;84out[5] = ( x4 ) & reduce_mask_25;85out[6] = ((((word64)x5 << 32) | x4) >> 25) & reduce_mask_26;86out[7] = ((((word64)x6 << 32) | x5) >> 19) & reduce_mask_25;87out[8] = ((((word64)x7 << 32) | x6) >> 12) & reduce_mask_26;88out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */8990out[10] = 0;91out[11] = 0;92}9394/* Take a fully reduced polynomial form number and contract it into a95* little-endian, 32-byte array96*/97inline void98curve25519_contract(byte out[32], const bignum25519 in) {99ALIGN(16) bignum25519 f;100101curve25519_copy(f, in);102103#define carry_pass() \104f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \105f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \106f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \107f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \108f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \109f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \110f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \111f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \112f[9] += f[8] >> 26; f[8] &= reduce_mask_26;113114#define carry_pass_full() \115carry_pass() \116f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;117118#define carry_pass_final() \119carry_pass() \120f[9] &= reduce_mask_25;121122carry_pass_full()123carry_pass_full()124125/* now t is between 0 and 2^255-1, properly carried. */126/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */127f[0] += 19;128carry_pass_full()129130/* now between 19 and 2^255-1 in both cases, and offset by 19. */131f[0] += (1 << 26) - 19;132f[1] += (1 << 25) - 1;133f[2] += (1 << 26) - 1;134f[3] += (1 << 25) - 1;135f[4] += (1 << 26) - 1;136f[5] += (1 << 25) - 1;137f[6] += (1 << 26) - 1;138f[7] += (1 << 25) - 1;139f[8] += (1 << 26) - 1;140f[9] += (1 << 25) - 1;141142/* now between 2^255 and 2^256-20, and offset by 2^255. */143carry_pass_final()144145#undef carry_pass146#undef carry_full147#undef carry_final148149*(word32 *)(out + 0) = ((f[0] ) | (f[1] << 26));150*(word32 *)(out + 4) = ((f[1] >> 6) | (f[2] << 19));151*(word32 *)(out + 8) = ((f[2] >> 13) | (f[3] << 13));152*(word32 *)(out + 12) = ((f[3] >> 19) | (f[4] << 6));153*(word32 *)(out + 16) = ((f[5] ) | (f[6] << 25));154*(word32 *)(out + 20) = ((f[6] >> 7) | (f[7] << 19));155*(word32 *)(out + 24) = ((f[7] >> 13) | (f[8] << 12));156*(word32 *)(out + 28) = ((f[8] >> 20) | (f[9] << 6));157}158159/*160* Maybe swap the contents of two felem arrays (@a and @b), each 5 elements161* long. Perform the swap iff @swap is non-zero.162*/163inline void164curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {165const word32 swap = (word32)(-(sword32)iswap);166xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;167xmmi mask = _mm_cvtsi32_si128(swap);168mask = _mm_shuffle_epi32(mask, 0);169a0 = _mm_load_si128((xmmi *)a + 0);170a1 = _mm_load_si128((xmmi *)a + 1);171a2 = _mm_load_si128((xmmi *)a + 2);172b0 = _mm_load_si128((xmmi *)b + 0);173b1 = _mm_load_si128((xmmi *)b + 1);174b2 = _mm_load_si128((xmmi *)b + 2);175b0 = _mm_xor_si128(a0, b0);176b1 = _mm_xor_si128(a1, b1);177b2 = _mm_xor_si128(a2, b2);178x0 = _mm_and_si128(b0, mask);179x1 = _mm_and_si128(b1, mask);180x2 = _mm_and_si128(b2, mask);181x0 = _mm_xor_si128(x0, a0);182x1 = _mm_xor_si128(x1, a1);183x2 = _mm_xor_si128(x2, a2);184a0 = _mm_xor_si128(x0, b0);185a1 = _mm_xor_si128(x1, b1);186a2 = _mm_xor_si128(x2, b2);187_mm_store_si128((xmmi *)a + 0, x0);188_mm_store_si128((xmmi *)a + 1, x1);189_mm_store_si128((xmmi *)a + 2, x2);190_mm_store_si128((xmmi *)b + 0, a0);191_mm_store_si128((xmmi *)b + 1, a1);192_mm_store_si128((xmmi *)b + 2, a2);193}194195/* interleave two bignums */196inline void197curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {198xmmi x0,x1,x2,z0,z1,z2;199200x0 = _mm_load_si128((xmmi *)(x + 0));201x1 = _mm_load_si128((xmmi *)(x + 4));202x2 = _mm_load_si128((xmmi *)(x + 8));203z0 = _mm_load_si128((xmmi *)(z + 0));204z1 = _mm_load_si128((xmmi *)(z + 4));205z2 = _mm_load_si128((xmmi *)(z + 8));206207out[0].v = _mm_unpacklo_epi32(x0, z0);208out[1].v = _mm_unpackhi_epi32(x0, z0);209out[2].v = _mm_unpacklo_epi32(x1, z1);210out[3].v = _mm_unpackhi_epi32(x1, z1);211out[4].v = _mm_unpacklo_epi32(x2, z2);212}213214/* split a packed bignum in to it's two parts */215inline void216curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {217_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));218_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));219_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );220_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));221_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));222_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );223}224225/* add two packed bignums */226inline void227curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {228out[0].v = _mm_add_epi32(r[0].v, s[0].v);229out[1].v = _mm_add_epi32(r[1].v, s[1].v);230out[2].v = _mm_add_epi32(r[2].v, s[2].v);231out[3].v = _mm_add_epi32(r[3].v, s[3].v);232out[4].v = _mm_add_epi32(r[4].v, s[4].v);233}234235/* subtract two packed bignums */236inline void237curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {238xmmi r0,r1,r2,r3,r4;239xmmi s0,s1,s2,s3;240xmmi c1,c2;241242r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v);243r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v);244r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v);245r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v);246r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v);247r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */248r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */249r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */250r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */251r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */252253s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */254s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */255s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */256s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */257258c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);259c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));260261out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */262out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */263out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */264out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */265out[4].v = r4; /* 88 99 */266}267268/* multiply two packed bignums */269inline void270curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {271xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;272xmmi r1_2,r3_2,r5_2,r7_2,r9_2;273xmmi c1,c2;274275out[0].v = _mm_mul_epu32(r[0].v, s[0].v);276out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));277r1_2 = _mm_slli_epi32(r[1].v, 1);278out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));279out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));280r3_2 = _mm_slli_epi32(r[3].v, 1);281out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));282out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));283r5_2 = _mm_slli_epi32(r[5].v, 1);284out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));285out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));286r7_2 = _mm_slli_epi32(r[7].v, 1);287out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));288out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));289290r1 = _mm_mul_epu32(r[1].v, packednineteen.v);291r2 = _mm_mul_epu32(r[2].v, packednineteen.v);292r1_2 = _mm_slli_epi32(r1, 1);293r3 = _mm_mul_epu32(r[3].v, packednineteen.v);294r4 = _mm_mul_epu32(r[4].v, packednineteen.v);295r3_2 = _mm_slli_epi32(r3, 1);296r5 = _mm_mul_epu32(r[5].v, packednineteen.v);297r6 = _mm_mul_epu32(r[6].v, packednineteen.v);298r5_2 = _mm_slli_epi32(r5, 1);299r7 = _mm_mul_epu32(r[7].v, packednineteen.v);300r8 = _mm_mul_epu32(r[8].v, packednineteen.v);301r7_2 = _mm_slli_epi32(r7, 1);302r9 = _mm_mul_epu32(r[9].v, packednineteen.v);303r9_2 = _mm_slli_epi32(r9, 1);304305out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));306out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));307out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));308out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));309out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));310out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));311out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));312out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));313out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));314315c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);316c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);317c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);318c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);319c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);320c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));321c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);322}323324/* multiply a bignum */325void326curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {327xmmi m01,m23,m45,m67,m89;328xmmi m0123,m4567;329xmmi s0123,s4567;330xmmi s01,s23,s45,s67,s89;331xmmi s12,s34,s56,s78,s9;332xmmi r0,r2,r4,r6,r8;333xmmi r1,r3,r5,r7,r9;334xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;335xmmi c1,c2,c3;336337s0123 = _mm_load_si128((xmmi*)s + 0);338s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));339s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));340s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));341s4567 = _mm_load_si128((xmmi*)s + 1);342s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);343s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));344s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));345s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));346s89 = _mm_load_si128((xmmi*)s + 2);347s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);348s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));349s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));350351r0 = _mm_load_si128((xmmi*)r + 0);352r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));353r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v));354r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));355r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));356r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v));357r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));358r4 = _mm_load_si128((xmmi*)r + 1);359r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));360r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v));361r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));362r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));363r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v));364r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));365r8 = _mm_load_si128((xmmi*)r + 2);366r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));367r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v));368r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));369370m01 = _mm_mul_epu32(r1,s01);371m23 = _mm_mul_epu32(r1,s23);372m45 = _mm_mul_epu32(r1,s45);373m67 = _mm_mul_epu32(r1,s67);374m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));375m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));376m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));377m89 = _mm_mul_epu32(r1,s89);378m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));379m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));380m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));381m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));382m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));383m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));384m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));385386/* shift up */387m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));388m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));389m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));390m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));391m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));392393m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));394m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));395m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));396m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));397m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));398m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));399m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));400m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));401m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));402m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));403m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));404m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));405m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));406m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));407m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));408409r219 = _mm_mul_epu32(r2, packednineteen.v);410r419 = _mm_mul_epu32(r4, packednineteen.v);411r619 = _mm_mul_epu32(r6, packednineteen.v);412r819 = _mm_mul_epu32(r8, packednineteen.v);413r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);414r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);415r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);416r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);417r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);418419m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));420m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));421m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));422m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));423m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));424m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));425m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));426m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));427m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));428m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));429m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));430m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));431m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));432m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));433m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));434m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));435m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));436m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));437m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));438m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));439m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));440m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));441m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));442m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));443m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));444445r0 = _mm_unpacklo_epi64(m01, m45);446r1 = _mm_unpackhi_epi64(m01, m45);447r2 = _mm_unpacklo_epi64(m23, m67);448r3 = _mm_unpackhi_epi64(m23, m67);449r4 = _mm_unpacklo_epi64(m89, m89);450r5 = _mm_unpackhi_epi64(m89, m89);451452c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);453c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);454c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);455c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));456c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);457458m0123 = _mm_unpacklo_epi32(r0, r1);459m4567 = _mm_unpackhi_epi32(r0, r1);460m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));461m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));462m89 = _mm_unpackhi_epi32(r4, r5);463464_mm_store_si128((xmmi*)out + 0, m0123);465_mm_store_si128((xmmi*)out + 1, m4567);466_mm_store_si128((xmmi*)out + 2, m89);467}468469typedef struct bignum25519mulprecomp_t {470xmmi r0,r2,r4,r6,r8;471xmmi r1,r3,r5,r7,r9;472xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;473} bignum25519mulprecomp;474475/* precompute a constant to multiply by */476inline void477curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) {478pre->r0 = _mm_load_si128((xmmi*)r + 0);479pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1));480pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v));481pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2));482pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3));483pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v));484pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0));485pre->r4 = _mm_load_si128((xmmi*)r + 1);486pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1));487pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v));488pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2));489pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3));490pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v));491pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0));492pre->r8 = _mm_load_si128((xmmi*)r + 2);493pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1));494pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v));495pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0));496497pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v);498pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v);499pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v);500pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v);501pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v);502pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v);503pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v);504pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v);505pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v);506}507508509/* multiply a bignum by a pre-computed constant */510inline void511curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) {512xmmi m01,m23,m45,m67,m89;513xmmi m0123,m4567;514xmmi s0123,s4567;515xmmi s01,s23,s45,s67,s89;516xmmi s12,s34,s56,s78,s9;517xmmi r0,r1,r2,r3,r4,r5;518xmmi c1,c2,c3;519520s0123 = _mm_load_si128((xmmi*)s + 0);521s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));522s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));523s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));524s4567 = _mm_load_si128((xmmi*)s + 1);525s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);526s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));527s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));528s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));529s89 = _mm_load_si128((xmmi*)s + 2);530s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);531s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));532s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));533534m01 = _mm_mul_epu32(r->r1,s01);535m23 = _mm_mul_epu32(r->r1,s23);536m45 = _mm_mul_epu32(r->r1,s45);537m67 = _mm_mul_epu32(r->r1,s67);538m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01));539m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23));540m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45));541m89 = _mm_mul_epu32(r->r1,s89);542m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01));543m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23));544m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67));545m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01));546m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45));547m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23));548m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01));549550/* shift up */551m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));552m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));553m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));554m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));555m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));556557m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01));558m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23));559m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45));560m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67));561m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01));562m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23));563m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23));564m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89));565m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01));566m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45));567m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67));568m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01));569m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45));570m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23));571m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01));572m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12));573m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34));574m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56));575m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78));576m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34));577m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56));578m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78));579m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9));580m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56));581m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78));582m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9));583m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89));584m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78));585m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9));586m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89));587m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9));588m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23));589m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45));590m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67));591m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45));592m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67));593m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67));594m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89));595m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89));596m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9));597598r0 = _mm_unpacklo_epi64(m01, m45);599r1 = _mm_unpackhi_epi64(m01, m45);600r2 = _mm_unpacklo_epi64(m23, m67);601r3 = _mm_unpackhi_epi64(m23, m67);602r4 = _mm_unpacklo_epi64(m89, m89);603r5 = _mm_unpackhi_epi64(m89, m89);604605c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);606c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);607c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);608c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));609c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);610611m0123 = _mm_unpacklo_epi32(r0, r1);612m4567 = _mm_unpackhi_epi32(r0, r1);613m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));614m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));615m89 = _mm_unpackhi_epi32(r4, r5);616617_mm_store_si128((xmmi*)out + 0, m0123);618_mm_store_si128((xmmi*)out + 1, m4567);619_mm_store_si128((xmmi*)out + 2, m89);620}621622/* square a bignum 'count' times */623#define curve25519_square(r,x) curve25519_square_times(r,x,1)624625void626curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {627xmmi m01,m23,m45,m67,m89;628xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;629xmmi r0a,r1a,r2a,r3a,r7a,r9a;630xmmi r0123,r4567;631xmmi r01,r23,r45,r67,r6x,r89,r8x;632xmmi r12,r34,r56,r78,r9x;633xmmi r5619;634xmmi c1,c2,c3;635636r0123 = _mm_load_si128((xmmi*)in + 0);637r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));638r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));639r4567 = _mm_load_si128((xmmi*)in + 1);640r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));641r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));642r89 = _mm_load_si128((xmmi*)in + 2);643r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));644645do {646r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));647r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));648r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v));649r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));650r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));651r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));652r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v));653r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));654r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));655r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));656r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));657r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v));658r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));659r5619 = _mm_mul_epu32(r56, packednineteen.v);660r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));661r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));662r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));663r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());664r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));665r7 = _mm_mul_epu32(r7, packed3819.v);666r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));667r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());668r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));669r8 = _mm_mul_epu32(r8, packednineteen.v);670r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));671r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);672r9 = _mm_mul_epu32(r9, packed3819.v);673r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));674675m01 = _mm_mul_epu32(r01, r0);676m23 = _mm_mul_epu32(r23, r0a);677m45 = _mm_mul_epu32(r45, r0a);678m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));679r23 = _mm_slli_epi32(r23, 1);680m67 = _mm_mul_epu32(r67, r0a);681m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));682m89 = _mm_mul_epu32(r89, r0a);683m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));684r67 = _mm_slli_epi32(r67, 1);685m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));686r45 = _mm_slli_epi32(r45, 1);687688r1 = _mm_slli_epi32(r1, 1);689r3 = _mm_slli_epi32(r3, 1);690r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v));691r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v));692693m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));694m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));695m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));696m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));697r34 = _mm_slli_epi32(r34, 1);698m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));699r78 = _mm_slli_epi32(r78, 1);700m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));701r56 = _mm_slli_epi32(r56, 1);702703m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));704m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));705m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));706m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));707m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));708m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));709m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));710m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));711m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));712m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));713m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));714m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));715m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));716m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));717m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));718719r0 = _mm_unpacklo_epi64(m01, m45);720r1 = _mm_unpackhi_epi64(m01, m45);721r2 = _mm_unpacklo_epi64(m23, m67);722r3 = _mm_unpackhi_epi64(m23, m67);723r4 = _mm_unpacklo_epi64(m89, m89);724r5 = _mm_unpackhi_epi64(m89, m89);725726c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);727c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);728c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);729c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));730c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);731732r01 = _mm_unpacklo_epi64(r0, r1);733r45 = _mm_unpackhi_epi64(r0, r1);734r23 = _mm_unpacklo_epi64(r2, r3);735r67 = _mm_unpackhi_epi64(r2, r3);736r89 = _mm_unpackhi_epi64(r4, r5);737} while (--count);738739r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));740r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));741r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));742r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));743r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));744745_mm_store_si128((xmmi*)r + 0, r0123);746_mm_store_si128((xmmi*)r + 1, r4567);747_mm_store_si128((xmmi*)r + 2, r89);748}749750/* square two packed bignums */751inline void752curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {753xmmi r0,r1,r2,r3;754xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;755xmmi d5,d6,d7,d8,d9;756xmmi c1,c2;757758r0 = r[0].v;759r1 = r[1].v;760r2 = r[2].v;761r3 = r[3].v;762763out[0].v = _mm_mul_epu32(r0, r0);764r0 = _mm_slli_epi32(r0, 1);765out[1].v = _mm_mul_epu32(r0, r1);766r1_2 = _mm_slli_epi32(r1, 1);767out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));768r1 = r1_2;769out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));770r3_2 = _mm_slli_epi32(r3, 1);771out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));772r2 = _mm_slli_epi32(r2, 1);773out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));774r5_2 = _mm_slli_epi32(r[5].v, 1);775out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));776r3 = r3_2;777out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));778r7_2 = _mm_slli_epi32(r[7].v, 1);779out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));780out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));781782d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);783d6 = _mm_mul_epu32(r[6].v, packednineteen.v);784d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);785d8 = _mm_mul_epu32(r[8].v, packednineteen.v);786d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);787788r4_2 = _mm_slli_epi32(r[4].v, 1);789r6_2 = _mm_slli_epi32(r[6].v, 1);790out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));791out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));792out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));793out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));794out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));795out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));796out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));797out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));798out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));799800c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);801c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);802c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);803c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);804c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);805c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));806c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);807}808809/* make [nqx+nqz,nqpqx+nqpqz], [nqpqx-nqpqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */810inline void811curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) {812primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0));813primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2));814primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0));815primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2));816primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0));817primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2));818primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0));819primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2));820primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0));821primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2));822primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1));823primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3));824primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1));825primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3));826primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1));827primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3));828primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1));829primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3));830primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1));831primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3));832}833834/* make [nqx+nqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */835inline void836curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) {837nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v);838nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v);839nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v);840nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v);841nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v);842nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v);843nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v);844nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v);845nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v);846nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v);847}848849/* compute [nqx+nqz,nqx-nqz] from nqx, nqz */850inline void851curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) {852xmmi x0,x1,x2;853xmmi z0,z1,z2;854xmmi a0,a1,a2;855xmmi s0,s1,s2;856xmmi r0,r1;857xmmi c1,c2;858x0 = _mm_load_si128((xmmi*)nqx + 0);859x1 = _mm_load_si128((xmmi*)nqx + 1);860x2 = _mm_load_si128((xmmi*)nqx + 2);861z0 = _mm_load_si128((xmmi*)nqz + 0);862z1 = _mm_load_si128((xmmi*)nqz + 1);863z2 = _mm_load_si128((xmmi*)nqz + 2);864a0 = _mm_add_epi32(x0, z0);865a1 = _mm_add_epi32(x1, z1);866a2 = _mm_add_epi32(x2, z2);867s0 = _mm_add_epi32(x0, packed2p0.v);868s1 = _mm_add_epi32(x1, packed2p1.v);869s2 = _mm_add_epi32(x2, packed2p2.v);870s0 = _mm_sub_epi32(s0, z0);871s1 = _mm_sub_epi32(s1, z1);872s2 = _mm_sub_epi32(s2, z2);873r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v);874r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v);875c1 = _mm_srli_epi32(r0, 26);876c2 = _mm_srli_epi32(r1, 25);877r0 = _mm_and_si128(r0, packedmask26.v);878r1 = _mm_and_si128(r1, packedmask25.v);879r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));880r1 = _mm_add_epi32(r1, c1);881s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));882s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8));883nq[0].v = _mm_unpacklo_epi64(a0, s0);884nq[2].v = _mm_unpackhi_epi64(a0, s0);885nq[4].v = _mm_unpacklo_epi64(a1, s1);886nq[6].v = _mm_unpackhi_epi64(a1, s1);887nq[8].v = _mm_unpacklo_epi64(a2, s2);888nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1));889nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1));890nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1));891nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1));892nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1));893}894895896/* compute [x+z,x-z] from [x,z] */897inline void898curve25519_addsub_packed64(packedelem64 *r) {899packed32bignum25519 x,z,add,sub;900901x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v);902z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v);903x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v);904z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v);905x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v);906z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v);907x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v);908z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v);909x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v);910z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v);911912curve25519_add_packed32(add, x, z);913curve25519_sub_packed32(sub, x, z);914915r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v);916r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v);917r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v);918r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v);919r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v);920r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v);921r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v);922r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v);923r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v);924r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v);925}926927/* compute [x,z] * [121666,121665] */928inline void929curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) {930xmmi c1,c2;931932out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v);933out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v);934out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v);935out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v);936out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v);937out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v);938out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v);939out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v);940out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v);941out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v);942943c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);944c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);945c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);946c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);947c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);948c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));949c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);950}951952/* compute [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */953inline void954curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) {955packed32bignum25519 x, z, sub;956packed64bignum25519 t, nqa, nqb;957958x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4));959z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4));960x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4));961z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4));962x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4));963z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4));964x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4));965z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4));966x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4));967z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4));968969curve25519_sub_packed32(sub, x, z);970971t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0));972t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2));973t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0));974t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2));975t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0));976t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2));977t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0));978t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2));979t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0));980t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2));981982nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v);983nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v);984nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v);985nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v);986nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v);987nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v);988nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v);989nqb[3].v = _mm_unpackhi_epi64(sq[3].v, t[3].v);990nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v);991nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v);992nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v);993nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v);994nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v);995nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v);996nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v);997nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v);998nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v);999nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v);1000nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v);1001nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v);10021003curve25519_mul_packed64(nq, nqa, nqb);1004}10051006/*1007* In: b = 2^5 - 2^01008* Out: b = 2^250 - 2^01009*/1010void1011curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {1012ALIGN(16) bignum25519 t0,c;10131014/* 2^5 - 2^0 */ /* b */1015/* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);1016/* 2^10 - 2^0 */ curve25519_mul(b, t0, b);1017/* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);1018/* 2^20 - 2^0 */ curve25519_mul(c, t0, b);1019/* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);1020/* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);1021/* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);1022/* 2^50 - 2^0 */ curve25519_mul(b, t0, b);1023/* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);1024/* 2^100 - 2^0 */ curve25519_mul(c, t0, b);1025/* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);1026/* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);1027/* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);1028/* 2^250 - 2^0 */ curve25519_mul(b, t0, b);1029}10301031/*1032* z^(p - 2) = z(2^255 - 21)1033*/1034void1035curve25519_recip(bignum25519 out, const bignum25519 z) {1036ALIGN(16) bignum25519 a, t0, b;10371038/* 2 */ curve25519_square(a, z); /* a = 2 */1039/* 8 */ curve25519_square_times(t0, a, 2);1040/* 9 */ curve25519_mul(b, t0, z); /* b = 9 */1041/* 11 */ curve25519_mul(a, b, a); /* a = 11 */1042/* 22 */ curve25519_square(t0, a);1043/* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);1044/* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);1045/* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);1046/* 2^255 - 21 */ curve25519_mul(out, b, a);1047}10481049ANONYMOUS_NAMESPACE_END10501051NAMESPACE_BEGIN(CryptoPP)1052NAMESPACE_BEGIN(Donna)10531054int curve25519_mult_SSE2(byte sharedKey[32], const byte secretKey[32], const byte othersKey[32])1055{1056FixedSizeSecBlock<byte, 32> e;1057for (size_t i = 0;i < 32;++i)1058e[i] = secretKey[i];1059e[0] &= 0xf8; e[31] &= 0x7f; e[31] |= 0x40;10601061ALIGN(16) bignum25519 nqx = {1}, nqpqz = {1}, nqz = {0}, nqpqx, zmone;1062packed32bignum25519 qx, qz, pqz, pqx;1063packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;1064bignum25519mulprecomp preq;1065size_t bit=0;10661067curve25519_expand(nqpqx, othersKey);1068curve25519_mul_precompute(&preq, nqpqx);10691070/* do bits 254..3 */1071for (size_t i = 254, lastbit=0; i >= 3; i--) {1072bit = (e[i/8] >> (i & 7)) & 1;1073curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));1074curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));1075lastbit = bit;10761077curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */1078curve25519_tangle32(qz, nqz, nqpqz); /* qz = [nqz,nqpqz] */10791080curve25519_add_packed32(pqx, qx, qz); /* pqx = [nqx+nqz,nqpqx+nqpqz] */1081curve25519_sub_packed32(pqz, qx, qz); /* pqz = [nqx-nqz,nqpqx-nqpqz] */10821083curve25519_make_nqpq(primex, primez, pqx, pqz); /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */1084curve25519_mul_packed64(prime, primex, primez); /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */1085curve25519_addsub_packed64(prime); /* prime = [prime.x+prime.z,prime.x-prime.z] */1086curve25519_square_packed64(nqpq, prime); /* nqpq = prime^2 */1087curve25519_untangle64(nqpqx, nqpqz, nqpq);1088curve25519_mul_precomputed(nqpqz, nqpqz, &preq); /* nqpqz = nqpqz * q */10891090/* (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */1091curve25519_make_nq(nq, pqx, pqz); /* nq = [nqx+nqz,nqx-nqz] */1092curve25519_square_packed64(sq, nq); /* sq = nq^2 */1093curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */1094curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */1095curve25519_untangle64(nqx, nqz, nq);1096};10971098/* it's possible to get rid of this swap with the swap in the above loop1099at the bottom instead of the top, but compilers seem to optimize better this way */1100curve25519_swap_conditional(nqx, nqpqx, (word32)bit);1101curve25519_swap_conditional(nqz, nqpqz, (word32)bit);11021103/* do bits 2..0 */1104for (size_t i = 0; i < 3; i++) {1105curve25519_compute_nq(nq, nqx, nqz);1106curve25519_square_packed64(sq, nq); /* sq = nq^2 */1107curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */1108curve25519_final_nq(nq, sq, sqscalar); /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */1109curve25519_untangle64(nqx, nqz, nq);1110}11111112curve25519_recip(zmone, nqz);1113curve25519_mul(nqz, nqx, zmone);1114curve25519_contract(sharedKey, nqz);11151116return 0;1117}11181119NAMESPACE_END // Donna1120NAMESPACE_END // CryptoPP11211122#endif // CRYPTOPP_CURVE25519_SSE2112311241125