Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
folium-app
GitHub Repository: folium-app/Folium
Path: blob/a-new-beginning/SharedDependencies/Sources/cryptopp/donna_sse.cpp
2 views
1
// donna_sse.cpp - written and placed in public domain by Jeffrey Walton
2
// This is an integration of Andrew Moon's public domain code.
3
// Also see https://github.com/floodyberry/curve25519-donna.
4
5
// This is an integration of Andrew Moon's public domain code. The port was
6
// clean, but it has one potential problem. The original code is C and relies
7
// upon unions. Accessing the inactive union member is undefined behavior in
8
// C++. That means copying the array into packedelem8.u is OK; but then using
9
// packedelem8.v in a calculation is UB. Fortunately most (all?) compilers
10
// take pity on C++ developers and compile the code. We will have to keep an
11
// eye on things or rewrite significant portions of this code.
12
13
// If needed, see Moon's commit "Go back to ignoring 256th bit [sic]",
14
// https://github.com/floodyberry/curve25519-donna/commit/57a683d18721a658
15
16
#include "pch.h"
17
18
#include "config.h"
19
#include "donna.h"
20
#include "secblock.h"
21
#include "misc.h"
22
23
// The data is aligned, but Clang issues warning based on type
24
// and not the actual alignment of the variable and data.
25
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
26
# pragma GCC diagnostic ignored "-Wcast-align"
27
# pragma GCC diagnostic ignored "-Wunused-function"
28
#endif
29
30
#if CRYPTOPP_MSC_VERSION
31
# pragma warning(disable: 4244)
32
#endif
33
34
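// Squash MS LNK4221 and libtool warnings: this symbol is defined outside the
// CRYPTOPP_CURVE25519_SSE2 guard, so it exists even when the SSE2 code is compiled out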
35
extern const char DONNA_SSE_FNAME[] = __FILE__;
36
37
#if (CRYPTOPP_CURVE25519_SSE2)
38
39
#include "donna_sse.h"
40
41
ANONYMOUS_NAMESPACE_BEGIN
42
43
using CryptoPP::byte;
44
using CryptoPP::word32;
45
using CryptoPP::sword32;
46
using CryptoPP::word64;
47
using CryptoPP::sword64;
48
using CryptoPP::GetBlock;
49
using CryptoPP::LittleEndian;
50
51
// Bring in all the symbols from the SSE header
52
using namespace CryptoPP::Donna::ArchSSE;
53
54
/* Copy a bignum to another: out = in */
55
inline void
56
curve25519_copy(bignum25519 out, const bignum25519 in) {
57
xmmi x0,x1,x2;
58
x0 = _mm_load_si128((xmmi*)in + 0);
59
x1 = _mm_load_si128((xmmi*)in + 1);
60
x2 = _mm_load_si128((xmmi*)in + 2);
61
_mm_store_si128((xmmi*)out + 0, x0);
62
_mm_store_si128((xmmi*)out + 1, x1);
63
_mm_store_si128((xmmi*)out + 2, x2);
64
}
65
66
/* Take a little-endian, 32-byte number and expand it into polynomial form */
67
inline void
68
curve25519_expand(bignum25519 out, const byte in[32]) {
69
word32 x0,x1,x2,x3,x4,x5,x6,x7;
70
71
x0 = *(word32 *)(in + 0);
72
x1 = *(word32 *)(in + 4);
73
x2 = *(word32 *)(in + 8);
74
x3 = *(word32 *)(in + 12);
75
x4 = *(word32 *)(in + 16);
76
x5 = *(word32 *)(in + 20);
77
x6 = *(word32 *)(in + 24);
78
x7 = *(word32 *)(in + 28);
79
80
out[0] = ( x0 ) & reduce_mask_26;
81
out[1] = ((((word64)x1 << 32) | x0) >> 26) & reduce_mask_25;
82
out[2] = ((((word64)x2 << 32) | x1) >> 19) & reduce_mask_26;
83
out[3] = ((((word64)x3 << 32) | x2) >> 13) & reduce_mask_25;
84
out[4] = (( x3) >> 6) & reduce_mask_26;
85
out[5] = ( x4 ) & reduce_mask_25;
86
out[6] = ((((word64)x5 << 32) | x4) >> 25) & reduce_mask_26;
87
out[7] = ((((word64)x6 << 32) | x5) >> 19) & reduce_mask_25;
88
out[8] = ((((word64)x7 << 32) | x6) >> 12) & reduce_mask_26;
89
out[9] = (( x7) >> 6) & reduce_mask_25; /* ignore the top bit */
90
91
out[10] = 0;
92
out[11] = 0;
93
}
94
95
/* Take a fully reduced polynomial form number and contract it into a
96
* little-endian, 32-byte array
97
*/
98
inline void
99
curve25519_contract(byte out[32], const bignum25519 in) {
100
ALIGN(16) bignum25519 f;
101
102
curve25519_copy(f, in);
103
104
#define carry_pass() \
105
f[1] += f[0] >> 26; f[0] &= reduce_mask_26; \
106
f[2] += f[1] >> 25; f[1] &= reduce_mask_25; \
107
f[3] += f[2] >> 26; f[2] &= reduce_mask_26; \
108
f[4] += f[3] >> 25; f[3] &= reduce_mask_25; \
109
f[5] += f[4] >> 26; f[4] &= reduce_mask_26; \
110
f[6] += f[5] >> 25; f[5] &= reduce_mask_25; \
111
f[7] += f[6] >> 26; f[6] &= reduce_mask_26; \
112
f[8] += f[7] >> 25; f[7] &= reduce_mask_25; \
113
f[9] += f[8] >> 26; f[8] &= reduce_mask_26;
114
115
#define carry_pass_full() \
116
carry_pass() \
117
f[0] += 19 * (f[9] >> 25); f[9] &= reduce_mask_25;
118
119
#define carry_pass_final() \
120
carry_pass() \
121
f[9] &= reduce_mask_25;
122
123
carry_pass_full()
124
carry_pass_full()
125
126
/* now t is between 0 and 2^255-1, properly carried. */
127
/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
128
f[0] += 19;
129
carry_pass_full()
130
131
/* now between 19 and 2^255-1 in both cases, and offset by 19. */
132
f[0] += (1 << 26) - 19;
133
f[1] += (1 << 25) - 1;
134
f[2] += (1 << 26) - 1;
135
f[3] += (1 << 25) - 1;
136
f[4] += (1 << 26) - 1;
137
f[5] += (1 << 25) - 1;
138
f[6] += (1 << 26) - 1;
139
f[7] += (1 << 25) - 1;
140
f[8] += (1 << 26) - 1;
141
f[9] += (1 << 25) - 1;
142
143
/* now between 2^255 and 2^256-20, and offset by 2^255. */
144
carry_pass_final()
145
146
#undef carry_pass
147
#undef carry_full
148
#undef carry_final
149
150
*(word32 *)(out + 0) = ((f[0] ) | (f[1] << 26));
151
*(word32 *)(out + 4) = ((f[1] >> 6) | (f[2] << 19));
152
*(word32 *)(out + 8) = ((f[2] >> 13) | (f[3] << 13));
153
*(word32 *)(out + 12) = ((f[3] >> 19) | (f[4] << 6));
154
*(word32 *)(out + 16) = ((f[5] ) | (f[6] << 25));
155
*(word32 *)(out + 20) = ((f[6] >> 7) | (f[7] << 19));
156
*(word32 *)(out + 24) = ((f[7] >> 13) | (f[8] << 12));
157
*(word32 *)(out + 28) = ((f[8] >> 20) | (f[9] << 6));
158
}
159
160
/*
161
* Maybe swap the contents of two felem arrays (@a and @b), each 5 elements
162
* long. Perform the swap iff @swap is non-zero.
163
*/
164
inline void
165
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
166
const word32 swap = (word32)(-(sword32)iswap);
167
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
168
xmmi mask = _mm_cvtsi32_si128(swap);
169
mask = _mm_shuffle_epi32(mask, 0);
170
a0 = _mm_load_si128((xmmi *)a + 0);
171
a1 = _mm_load_si128((xmmi *)a + 1);
172
a2 = _mm_load_si128((xmmi *)a + 2);
173
b0 = _mm_load_si128((xmmi *)b + 0);
174
b1 = _mm_load_si128((xmmi *)b + 1);
175
b2 = _mm_load_si128((xmmi *)b + 2);
176
b0 = _mm_xor_si128(a0, b0);
177
b1 = _mm_xor_si128(a1, b1);
178
b2 = _mm_xor_si128(a2, b2);
179
x0 = _mm_and_si128(b0, mask);
180
x1 = _mm_and_si128(b1, mask);
181
x2 = _mm_and_si128(b2, mask);
182
x0 = _mm_xor_si128(x0, a0);
183
x1 = _mm_xor_si128(x1, a1);
184
x2 = _mm_xor_si128(x2, a2);
185
a0 = _mm_xor_si128(x0, b0);
186
a1 = _mm_xor_si128(x1, b1);
187
a2 = _mm_xor_si128(x2, b2);
188
_mm_store_si128((xmmi *)a + 0, x0);
189
_mm_store_si128((xmmi *)a + 1, x1);
190
_mm_store_si128((xmmi *)a + 2, x2);
191
_mm_store_si128((xmmi *)b + 0, a0);
192
_mm_store_si128((xmmi *)b + 1, a1);
193
_mm_store_si128((xmmi *)b + 2, a2);
194
}
195
196
/* interleave two bignums */
197
inline void
198
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
199
xmmi x0,x1,x2,z0,z1,z2;
200
201
x0 = _mm_load_si128((xmmi *)(x + 0));
202
x1 = _mm_load_si128((xmmi *)(x + 4));
203
x2 = _mm_load_si128((xmmi *)(x + 8));
204
z0 = _mm_load_si128((xmmi *)(z + 0));
205
z1 = _mm_load_si128((xmmi *)(z + 4));
206
z2 = _mm_load_si128((xmmi *)(z + 8));
207
208
out[0].v = _mm_unpacklo_epi32(x0, z0);
209
out[1].v = _mm_unpackhi_epi32(x0, z0);
210
out[2].v = _mm_unpacklo_epi32(x1, z1);
211
out[3].v = _mm_unpackhi_epi32(x1, z1);
212
out[4].v = _mm_unpacklo_epi32(x2, z2);
213
}
214
215
/* split a packed bignum in to it's two parts */
216
inline void
217
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
218
_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
219
_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
220
_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
221
_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
222
_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
223
_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
224
}
225
226
/* add two packed bignums */
227
inline void
228
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
229
out[0].v = _mm_add_epi32(r[0].v, s[0].v);
230
out[1].v = _mm_add_epi32(r[1].v, s[1].v);
231
out[2].v = _mm_add_epi32(r[2].v, s[2].v);
232
out[3].v = _mm_add_epi32(r[3].v, s[3].v);
233
out[4].v = _mm_add_epi32(r[4].v, s[4].v);
234
}
235
236
/* subtract two packed bignums */
237
inline void
238
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
239
xmmi r0,r1,r2,r3,r4;
240
xmmi s0,s1,s2,s3;
241
xmmi c1,c2;
242
243
r0 = _mm_add_epi32(r[0].v, packed32zeromodp0.v);
244
r1 = _mm_add_epi32(r[1].v, packed32zeromodp1.v);
245
r2 = _mm_add_epi32(r[2].v, packed32zeromodp1.v);
246
r3 = _mm_add_epi32(r[3].v, packed32zeromodp1.v);
247
r4 = _mm_add_epi32(r[4].v, packed32zeromodp1.v);
248
r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
249
r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
250
r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
251
r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
252
r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
253
254
s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
255
s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
256
s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
257
s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
258
259
c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
260
c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));
261
262
out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
263
out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
264
out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
265
out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
266
out[4].v = r4; /* 88 99 */
267
}
268
269
/* multiply two packed bignums */
270
inline void
271
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
272
xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
273
xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
274
xmmi c1,c2;
275
276
out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
277
out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
278
r1_2 = _mm_slli_epi32(r[1].v, 1);
279
out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
280
out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
281
r3_2 = _mm_slli_epi32(r[3].v, 1);
282
out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
283
out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
284
r5_2 = _mm_slli_epi32(r[5].v, 1);
285
out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
286
out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
287
r7_2 = _mm_slli_epi32(r[7].v, 1);
288
out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
289
out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
290
291
r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
292
r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
293
r1_2 = _mm_slli_epi32(r1, 1);
294
r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
295
r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
296
r3_2 = _mm_slli_epi32(r3, 1);
297
r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
298
r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
299
r5_2 = _mm_slli_epi32(r5, 1);
300
r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
301
r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
302
r7_2 = _mm_slli_epi32(r7, 1);
303
r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
304
r9_2 = _mm_slli_epi32(r9, 1);
305
306
out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
307
out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
308
out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
309
out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
310
out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
311
out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
312
out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
313
out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
314
out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
315
316
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
317
c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
318
c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
319
c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
320
c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
321
c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
322
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
323
}
324
325
/* multiply a bignum */
326
void
327
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
328
xmmi m01,m23,m45,m67,m89;
329
xmmi m0123,m4567;
330
xmmi s0123,s4567;
331
xmmi s01,s23,s45,s67,s89;
332
xmmi s12,s34,s56,s78,s9;
333
xmmi r0,r2,r4,r6,r8;
334
xmmi r1,r3,r5,r7,r9;
335
xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
336
xmmi c1,c2,c3;
337
338
s0123 = _mm_load_si128((xmmi*)s + 0);
339
s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
340
s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
341
s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
342
s4567 = _mm_load_si128((xmmi*)s + 1);
343
s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
344
s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
345
s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
346
s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
347
s89 = _mm_load_si128((xmmi*)s + 2);
348
s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
349
s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
350
s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
351
352
r0 = _mm_load_si128((xmmi*)r + 0);
353
r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
354
r1 = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_top64bitmask.v));
355
r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
356
r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
357
r3 = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_top64bitmask.v));
358
r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
359
r4 = _mm_load_si128((xmmi*)r + 1);
360
r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
361
r5 = _mm_add_epi64(r5, _mm_and_si128(r5, sse2_top64bitmask.v));
362
r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
363
r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
364
r7 = _mm_add_epi64(r7, _mm_and_si128(r7, sse2_top64bitmask.v));
365
r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
366
r8 = _mm_load_si128((xmmi*)r + 2);
367
r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
368
r9 = _mm_add_epi64(r9, _mm_and_si128(r9, sse2_top64bitmask.v));
369
r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
370
371
m01 = _mm_mul_epu32(r1,s01);
372
m23 = _mm_mul_epu32(r1,s23);
373
m45 = _mm_mul_epu32(r1,s45);
374
m67 = _mm_mul_epu32(r1,s67);
375
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
376
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
377
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
378
m89 = _mm_mul_epu32(r1,s89);
379
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
380
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
381
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
382
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
383
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
384
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
385
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));
386
387
/* shift up */
388
m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
389
m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
390
m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
391
m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
392
m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
393
394
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
395
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
396
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
397
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
398
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
399
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
400
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
401
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
402
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
403
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
404
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
405
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
406
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
407
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
408
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));
409
410
r219 = _mm_mul_epu32(r2, packednineteen.v);
411
r419 = _mm_mul_epu32(r4, packednineteen.v);
412
r619 = _mm_mul_epu32(r6, packednineteen.v);
413
r819 = _mm_mul_epu32(r8, packednineteen.v);
414
r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
415
r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
416
r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
417
r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
418
r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
419
420
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
421
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
422
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
423
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
424
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
425
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
426
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
427
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
428
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
429
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
430
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
431
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
432
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
433
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
434
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
435
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
436
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
437
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
438
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
439
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
440
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
441
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
442
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
443
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
444
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
445
446
r0 = _mm_unpacklo_epi64(m01, m45);
447
r1 = _mm_unpackhi_epi64(m01, m45);
448
r2 = _mm_unpacklo_epi64(m23, m67);
449
r3 = _mm_unpackhi_epi64(m23, m67);
450
r4 = _mm_unpacklo_epi64(m89, m89);
451
r5 = _mm_unpackhi_epi64(m89, m89);
452
453
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
454
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
455
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
456
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
457
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
458
459
m0123 = _mm_unpacklo_epi32(r0, r1);
460
m4567 = _mm_unpackhi_epi32(r0, r1);
461
m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
462
m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
463
m89 = _mm_unpackhi_epi32(r4, r5);
464
465
_mm_store_si128((xmmi*)out + 0, m0123);
466
_mm_store_si128((xmmi*)out + 1, m4567);
467
_mm_store_si128((xmmi*)out + 2, m89);
468
}
469
470
typedef struct bignum25519mulprecomp_t {
471
xmmi r0,r2,r4,r6,r8;
472
xmmi r1,r3,r5,r7,r9;
473
xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
474
} bignum25519mulprecomp;
475
476
/* precompute a constant to multiply by */
477
inline void
478
curve25519_mul_precompute(bignum25519mulprecomp *pre, const bignum25519 r) {
479
pre->r0 = _mm_load_si128((xmmi*)r + 0);
480
pre->r1 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(1,1,1,1));
481
pre->r1 = _mm_add_epi64(pre->r1, _mm_and_si128(pre->r1, sse2_top64bitmask.v));
482
pre->r2 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(2,2,2,2));
483
pre->r3 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(3,3,3,3));
484
pre->r3 = _mm_add_epi64(pre->r3, _mm_and_si128(pre->r3, sse2_top64bitmask.v));
485
pre->r0 = _mm_shuffle_epi32(pre->r0, _MM_SHUFFLE(0,0,0,0));
486
pre->r4 = _mm_load_si128((xmmi*)r + 1);
487
pre->r5 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(1,1,1,1));
488
pre->r5 = _mm_add_epi64(pre->r5, _mm_and_si128(pre->r5, sse2_top64bitmask.v));
489
pre->r6 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(2,2,2,2));
490
pre->r7 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(3,3,3,3));
491
pre->r7 = _mm_add_epi64(pre->r7, _mm_and_si128(pre->r7, sse2_top64bitmask.v));
492
pre->r4 = _mm_shuffle_epi32(pre->r4, _MM_SHUFFLE(0,0,0,0));
493
pre->r8 = _mm_load_si128((xmmi*)r + 2);
494
pre->r9 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,1,3,1));
495
pre->r9 = _mm_add_epi64(pre->r9, _mm_and_si128(pre->r9, sse2_top64bitmask.v));
496
pre->r8 = _mm_shuffle_epi32(pre->r8, _MM_SHUFFLE(3,0,3,0));
497
498
pre->r219 = _mm_mul_epu32(pre->r2, packednineteen.v);
499
pre->r419 = _mm_mul_epu32(pre->r4, packednineteen.v);
500
pre->r619 = _mm_mul_epu32(pre->r6, packednineteen.v);
501
pre->r819 = _mm_mul_epu32(pre->r8, packednineteen.v);
502
pre->r119 = _mm_shuffle_epi32(pre->r1,_MM_SHUFFLE(0,0,2,2)); pre->r119 = _mm_mul_epu32(pre->r119, packednineteen.v);
503
pre->r319 = _mm_shuffle_epi32(pre->r3,_MM_SHUFFLE(0,0,2,2)); pre->r319 = _mm_mul_epu32(pre->r319, packednineteen.v);
504
pre->r519 = _mm_shuffle_epi32(pre->r5,_MM_SHUFFLE(0,0,2,2)); pre->r519 = _mm_mul_epu32(pre->r519, packednineteen.v);
505
pre->r719 = _mm_shuffle_epi32(pre->r7,_MM_SHUFFLE(0,0,2,2)); pre->r719 = _mm_mul_epu32(pre->r719, packednineteen.v);
506
pre->r919 = _mm_shuffle_epi32(pre->r9,_MM_SHUFFLE(0,0,2,2)); pre->r919 = _mm_mul_epu32(pre->r919, packednineteen.v);
507
}
508
509
510
/* multiply a bignum by a pre-computed constant */
511
inline void
512
curve25519_mul_precomputed(bignum25519 out, const bignum25519 s, const bignum25519mulprecomp *r) {
513
xmmi m01,m23,m45,m67,m89;
514
xmmi m0123,m4567;
515
xmmi s0123,s4567;
516
xmmi s01,s23,s45,s67,s89;
517
xmmi s12,s34,s56,s78,s9;
518
xmmi r0,r1,r2,r3,r4,r5;
519
xmmi c1,c2,c3;
520
521
s0123 = _mm_load_si128((xmmi*)s + 0);
522
s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
523
s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
524
s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
525
s4567 = _mm_load_si128((xmmi*)s + 1);
526
s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
527
s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
528
s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
529
s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
530
s89 = _mm_load_si128((xmmi*)s + 2);
531
s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
532
s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
533
s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
534
535
m01 = _mm_mul_epu32(r->r1,s01);
536
m23 = _mm_mul_epu32(r->r1,s23);
537
m45 = _mm_mul_epu32(r->r1,s45);
538
m67 = _mm_mul_epu32(r->r1,s67);
539
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r3,s01));
540
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r3,s23));
541
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r3,s45));
542
m89 = _mm_mul_epu32(r->r1,s89);
543
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r5,s01));
544
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r5,s23));
545
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r3,s67));
546
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r7,s01));
547
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r5,s45));
548
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r7,s23));
549
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r9,s01));
550
551
/* shift up */
552
m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
553
m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
554
m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
555
m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
556
m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
557
558
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r0,s01));
559
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r0,s23));
560
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r0,s45));
561
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r0,s67));
562
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r2,s01));
563
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r2,s23));
564
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r4,s23));
565
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r0,s89));
566
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r4,s01));
567
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r2,s45));
568
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r2,s67));
569
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r6,s01));
570
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r4,s45));
571
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r6,s23));
572
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r8,s01));
573
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r919,s12));
574
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r919,s34));
575
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r919,s56));
576
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r919,s78));
577
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r719,s34));
578
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r719,s56));
579
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r719,s78));
580
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r719,s9));
581
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r519,s56));
582
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r519,s78));
583
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r519,s9));
584
m67 = _mm_add_epi64(m67,_mm_mul_epu32(r->r819,s89));
585
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r319,s78));
586
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r319,s9));
587
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r619,s89));
588
m89 = _mm_add_epi64(m89,_mm_mul_epu32(r->r919,s9));
589
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r819,s23));
590
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r819,s45));
591
m45 = _mm_add_epi64(m45,_mm_mul_epu32(r->r819,s67));
592
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r619,s45));
593
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r619,s67));
594
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r419,s67));
595
m23 = _mm_add_epi64(m23,_mm_mul_epu32(r->r419,s89));
596
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r219,s89));
597
m01 = _mm_add_epi64(m01,_mm_mul_epu32(r->r119,s9));
598
599
r0 = _mm_unpacklo_epi64(m01, m45);
600
r1 = _mm_unpackhi_epi64(m01, m45);
601
r2 = _mm_unpacklo_epi64(m23, m67);
602
r3 = _mm_unpackhi_epi64(m23, m67);
603
r4 = _mm_unpacklo_epi64(m89, m89);
604
r5 = _mm_unpackhi_epi64(m89, m89);
605
606
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
607
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
608
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
609
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
610
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
611
612
m0123 = _mm_unpacklo_epi32(r0, r1);
613
m4567 = _mm_unpackhi_epi32(r0, r1);
614
m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
615
m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
616
m89 = _mm_unpackhi_epi32(r4, r5);
617
618
_mm_store_si128((xmmi*)out + 0, m0123);
619
_mm_store_si128((xmmi*)out + 1, m4567);
620
_mm_store_si128((xmmi*)out + 2, m89);
621
}
622
623
/* square a bignum 'count' times */
624
#define curve25519_square(r,x) curve25519_square_times(r,x,1)
625
626
void
627
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
628
xmmi m01,m23,m45,m67,m89;
629
xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
630
xmmi r0a,r1a,r2a,r3a,r7a,r9a;
631
xmmi r0123,r4567;
632
xmmi r01,r23,r45,r67,r6x,r89,r8x;
633
xmmi r12,r34,r56,r78,r9x;
634
xmmi r5619;
635
xmmi c1,c2,c3;
636
637
r0123 = _mm_load_si128((xmmi*)in + 0);
638
r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
639
r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
640
r4567 = _mm_load_si128((xmmi*)in + 1);
641
r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
642
r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
643
r89 = _mm_load_si128((xmmi*)in + 2);
644
r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));
645
646
do {
647
r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
648
r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
649
r0 = _mm_add_epi64(r0, _mm_and_si128(r0, sse2_top64bitmask.v));
650
r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
651
r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
652
r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
653
r2 = _mm_add_epi64(r2, _mm_and_si128(r2, sse2_top64bitmask.v));
654
r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
655
r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
656
r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
657
r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
658
r4 = _mm_add_epi64(r4, _mm_and_si128(r4, sse2_top64bitmask.v));
659
r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
660
r5619 = _mm_mul_epu32(r56, packednineteen.v);
661
r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
662
r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
663
r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
664
r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
665
r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
666
r7 = _mm_mul_epu32(r7, packed3819.v);
667
r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
668
r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
669
r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
670
r8 = _mm_mul_epu32(r8, packednineteen.v);
671
r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
672
r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
673
r9 = _mm_mul_epu32(r9, packed3819.v);
674
r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
675
676
m01 = _mm_mul_epu32(r01, r0);
677
m23 = _mm_mul_epu32(r23, r0a);
678
m45 = _mm_mul_epu32(r45, r0a);
679
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
680
r23 = _mm_slli_epi32(r23, 1);
681
m67 = _mm_mul_epu32(r67, r0a);
682
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
683
m89 = _mm_mul_epu32(r89, r0a);
684
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
685
r67 = _mm_slli_epi32(r67, 1);
686
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
687
r45 = _mm_slli_epi32(r45, 1);
688
689
r1 = _mm_slli_epi32(r1, 1);
690
r3 = _mm_slli_epi32(r3, 1);
691
r1a = _mm_add_epi64(r1, _mm_and_si128(r1, sse2_bot64bitmask.v));
692
r3a = _mm_add_epi64(r3, _mm_and_si128(r3, sse2_bot64bitmask.v));
693
694
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
695
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
696
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
697
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
698
r34 = _mm_slli_epi32(r34, 1);
699
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
700
r78 = _mm_slli_epi32(r78, 1);
701
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
702
r56 = _mm_slli_epi32(r56, 1);
703
704
m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
705
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
706
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
707
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
708
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
709
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
710
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
711
m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
712
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
713
m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
714
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
715
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
716
m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
717
m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
718
m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
719
720
r0 = _mm_unpacklo_epi64(m01, m45);
721
r1 = _mm_unpackhi_epi64(m01, m45);
722
r2 = _mm_unpacklo_epi64(m23, m67);
723
r3 = _mm_unpackhi_epi64(m23, m67);
724
r4 = _mm_unpacklo_epi64(m89, m89);
725
r5 = _mm_unpackhi_epi64(m89, m89);
726
727
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
728
c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
729
c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
730
c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
731
c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
732
733
r01 = _mm_unpacklo_epi64(r0, r1);
734
r45 = _mm_unpackhi_epi64(r0, r1);
735
r23 = _mm_unpacklo_epi64(r2, r3);
736
r67 = _mm_unpackhi_epi64(r2, r3);
737
r89 = _mm_unpackhi_epi64(r4, r5);
738
} while (--count);
739
740
r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
741
r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
742
r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
743
r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
744
r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
745
746
_mm_store_si128((xmmi*)r + 0, r0123);
747
_mm_store_si128((xmmi*)r + 1, r4567);
748
_mm_store_si128((xmmi*)r + 2, r89);
749
}
750
751
/* square two packed bignums */
752
inline void
753
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
754
xmmi r0,r1,r2,r3;
755
xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
756
xmmi d5,d6,d7,d8,d9;
757
xmmi c1,c2;
758
759
r0 = r[0].v;
760
r1 = r[1].v;
761
r2 = r[2].v;
762
r3 = r[3].v;
763
764
out[0].v = _mm_mul_epu32(r0, r0);
765
r0 = _mm_slli_epi32(r0, 1);
766
out[1].v = _mm_mul_epu32(r0, r1);
767
r1_2 = _mm_slli_epi32(r1, 1);
768
out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
769
r1 = r1_2;
770
out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
771
r3_2 = _mm_slli_epi32(r3, 1);
772
out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
773
r2 = _mm_slli_epi32(r2, 1);
774
out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
775
r5_2 = _mm_slli_epi32(r[5].v, 1);
776
out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
777
r3 = r3_2;
778
out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
779
r7_2 = _mm_slli_epi32(r[7].v, 1);
780
out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
781
out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
782
783
d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
784
d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
785
d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
786
d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
787
d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
788
789
r4_2 = _mm_slli_epi32(r[4].v, 1);
790
r6_2 = _mm_slli_epi32(r[6].v, 1);
791
out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
792
out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
793
out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
794
out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
795
out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
796
out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
797
out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
798
out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
799
out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
800
801
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
802
c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
803
c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
804
c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
805
c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
806
c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
807
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
808
}
809
810
/* make [nqx+nqz,nqpqx+nqpqz], [nqpqx-nqpqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
811
inline void
812
curve25519_make_nqpq(packedelem64 *primex, packedelem64 *primez, const packedelem32 *pqx, const packedelem32 *pqz) {
813
primex[0].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(1,1,0,0));
814
primex[1].v = _mm_shuffle_epi32(pqx[0].v, _MM_SHUFFLE(3,3,2,2));
815
primex[2].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(1,1,0,0));
816
primex[3].v = _mm_shuffle_epi32(pqx[1].v, _MM_SHUFFLE(3,3,2,2));
817
primex[4].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(1,1,0,0));
818
primex[5].v = _mm_shuffle_epi32(pqx[2].v, _MM_SHUFFLE(3,3,2,2));
819
primex[6].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(1,1,0,0));
820
primex[7].v = _mm_shuffle_epi32(pqx[3].v, _MM_SHUFFLE(3,3,2,2));
821
primex[8].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(1,1,0,0));
822
primex[9].v = _mm_shuffle_epi32(pqx[4].v, _MM_SHUFFLE(3,3,2,2));
823
primez[0].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(0,0,1,1));
824
primez[1].v = _mm_shuffle_epi32(pqz[0].v, _MM_SHUFFLE(2,2,3,3));
825
primez[2].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(0,0,1,1));
826
primez[3].v = _mm_shuffle_epi32(pqz[1].v, _MM_SHUFFLE(2,2,3,3));
827
primez[4].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(0,0,1,1));
828
primez[5].v = _mm_shuffle_epi32(pqz[2].v, _MM_SHUFFLE(2,2,3,3));
829
primez[6].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(0,0,1,1));
830
primez[7].v = _mm_shuffle_epi32(pqz[3].v, _MM_SHUFFLE(2,2,3,3));
831
primez[8].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(0,0,1,1));
832
primez[9].v = _mm_shuffle_epi32(pqz[4].v, _MM_SHUFFLE(2,2,3,3));
833
}
834
835
/* make [nqx+nqz,nqx-nqz] from [nqx+nqz,nqpqx+nqpqz], [nqx-nqz,nqpqx-nqpqz] */
836
inline void
837
curve25519_make_nq(packedelem64 *nq, const packedelem32 *pqx, const packedelem32 *pqz) {
838
nq[0].v = _mm_unpacklo_epi64(pqx[0].v, pqz[0].v);
839
nq[1].v = _mm_unpackhi_epi64(pqx[0].v, pqz[0].v);
840
nq[2].v = _mm_unpacklo_epi64(pqx[1].v, pqz[1].v);
841
nq[3].v = _mm_unpackhi_epi64(pqx[1].v, pqz[1].v);
842
nq[4].v = _mm_unpacklo_epi64(pqx[2].v, pqz[2].v);
843
nq[5].v = _mm_unpackhi_epi64(pqx[2].v, pqz[2].v);
844
nq[6].v = _mm_unpacklo_epi64(pqx[3].v, pqz[3].v);
845
nq[7].v = _mm_unpackhi_epi64(pqx[3].v, pqz[3].v);
846
nq[8].v = _mm_unpacklo_epi64(pqx[4].v, pqz[4].v);
847
nq[9].v = _mm_unpackhi_epi64(pqx[4].v, pqz[4].v);
848
}
849
850
/* compute [nqx+nqz,nqx-nqz] from nqx, nqz */
851
inline void
852
curve25519_compute_nq(packedelem64 *nq, const bignum25519 nqx, const bignum25519 nqz) {
853
xmmi x0,x1,x2;
854
xmmi z0,z1,z2;
855
xmmi a0,a1,a2;
856
xmmi s0,s1,s2;
857
xmmi r0,r1;
858
xmmi c1,c2;
859
x0 = _mm_load_si128((xmmi*)nqx + 0);
860
x1 = _mm_load_si128((xmmi*)nqx + 1);
861
x2 = _mm_load_si128((xmmi*)nqx + 2);
862
z0 = _mm_load_si128((xmmi*)nqz + 0);
863
z1 = _mm_load_si128((xmmi*)nqz + 1);
864
z2 = _mm_load_si128((xmmi*)nqz + 2);
865
a0 = _mm_add_epi32(x0, z0);
866
a1 = _mm_add_epi32(x1, z1);
867
a2 = _mm_add_epi32(x2, z2);
868
s0 = _mm_add_epi32(x0, packed2p0.v);
869
s1 = _mm_add_epi32(x1, packed2p1.v);
870
s2 = _mm_add_epi32(x2, packed2p2.v);
871
s0 = _mm_sub_epi32(s0, z0);
872
s1 = _mm_sub_epi32(s1, z1);
873
s2 = _mm_sub_epi32(s2, z2);
874
r0 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(2,2,0,0)), sse2_bot32bitmask.v);
875
r1 = _mm_and_si128(_mm_shuffle_epi32(s0, _MM_SHUFFLE(3,3,1,1)), sse2_bot32bitmask.v);
876
c1 = _mm_srli_epi32(r0, 26);
877
c2 = _mm_srli_epi32(r1, 25);
878
r0 = _mm_and_si128(r0, packedmask26.v);
879
r1 = _mm_and_si128(r1, packedmask25.v);
880
r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
881
r1 = _mm_add_epi32(r1, c1);
882
s0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
883
s1 = _mm_add_epi32(s1, _mm_srli_si128(c2, 8));
884
nq[0].v = _mm_unpacklo_epi64(a0, s0);
885
nq[2].v = _mm_unpackhi_epi64(a0, s0);
886
nq[4].v = _mm_unpacklo_epi64(a1, s1);
887
nq[6].v = _mm_unpackhi_epi64(a1, s1);
888
nq[8].v = _mm_unpacklo_epi64(a2, s2);
889
nq[1].v = _mm_shuffle_epi32(nq[0].v, _MM_SHUFFLE(3,3,1,1));
890
nq[3].v = _mm_shuffle_epi32(nq[2].v, _MM_SHUFFLE(3,3,1,1));
891
nq[5].v = _mm_shuffle_epi32(nq[4].v, _MM_SHUFFLE(3,3,1,1));
892
nq[7].v = _mm_shuffle_epi32(nq[6].v, _MM_SHUFFLE(3,3,1,1));
893
nq[9].v = _mm_shuffle_epi32(nq[8].v, _MM_SHUFFLE(3,3,1,1));
894
}
895
896
897
/* compute [x+z,x-z] from [x,z] */
898
inline void
899
curve25519_addsub_packed64(packedelem64 *r) {
900
packed32bignum25519 x,z,add,sub;
901
902
x[0].v = _mm_unpacklo_epi64(r[0].v, r[1].v);
903
z[0].v = _mm_unpackhi_epi64(r[0].v, r[1].v);
904
x[1].v = _mm_unpacklo_epi64(r[2].v, r[3].v);
905
z[1].v = _mm_unpackhi_epi64(r[2].v, r[3].v);
906
x[2].v = _mm_unpacklo_epi64(r[4].v, r[5].v);
907
z[2].v = _mm_unpackhi_epi64(r[4].v, r[5].v);
908
x[3].v = _mm_unpacklo_epi64(r[6].v, r[7].v);
909
z[3].v = _mm_unpackhi_epi64(r[6].v, r[7].v);
910
x[4].v = _mm_unpacklo_epi64(r[8].v, r[9].v);
911
z[4].v = _mm_unpackhi_epi64(r[8].v, r[9].v);
912
913
curve25519_add_packed32(add, x, z);
914
curve25519_sub_packed32(sub, x, z);
915
916
r[0].v = _mm_unpacklo_epi64(add[0].v, sub[0].v);
917
r[1].v = _mm_unpackhi_epi64(add[0].v, sub[0].v);
918
r[2].v = _mm_unpacklo_epi64(add[1].v, sub[1].v);
919
r[3].v = _mm_unpackhi_epi64(add[1].v, sub[1].v);
920
r[4].v = _mm_unpacklo_epi64(add[2].v, sub[2].v);
921
r[5].v = _mm_unpackhi_epi64(add[2].v, sub[2].v);
922
r[6].v = _mm_unpacklo_epi64(add[3].v, sub[3].v);
923
r[7].v = _mm_unpackhi_epi64(add[3].v, sub[3].v);
924
r[8].v = _mm_unpacklo_epi64(add[4].v, sub[4].v);
925
r[9].v = _mm_unpackhi_epi64(add[4].v, sub[4].v);
926
}
927
928
/* compute [x,z] * [121666,121665] */
929
inline void
930
curve25519_121665_packed64(packedelem64 *out, const packedelem64 *in) {
931
xmmi c1,c2;
932
933
out[0].v = _mm_mul_epu32(in[0].v, packed121666121665.v);
934
out[1].v = _mm_mul_epu32(in[1].v, packed121666121665.v);
935
out[2].v = _mm_mul_epu32(in[2].v, packed121666121665.v);
936
out[3].v = _mm_mul_epu32(in[3].v, packed121666121665.v);
937
out[4].v = _mm_mul_epu32(in[4].v, packed121666121665.v);
938
out[5].v = _mm_mul_epu32(in[5].v, packed121666121665.v);
939
out[6].v = _mm_mul_epu32(in[6].v, packed121666121665.v);
940
out[7].v = _mm_mul_epu32(in[7].v, packed121666121665.v);
941
out[8].v = _mm_mul_epu32(in[8].v, packed121666121665.v);
942
out[9].v = _mm_mul_epu32(in[9].v, packed121666121665.v);
943
944
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
945
c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
946
c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
947
c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
948
c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
949
c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
950
c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
951
}
952
953
/* compute [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
954
inline void
955
curve25519_final_nq(packedelem64 *nq, const packedelem64 *sq, const packedelem64 *sq121665) {
956
packed32bignum25519 x, z, sub;
957
packed64bignum25519 t, nqa, nqb;
958
959
x[0].v = _mm_or_si128(_mm_unpacklo_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[0].v, sq121665[1].v), 4));
960
z[0].v = _mm_or_si128(_mm_unpackhi_epi64(sq[0].v, sq[1].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[0].v, sq121665[1].v), 4));
961
x[1].v = _mm_or_si128(_mm_unpacklo_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[2].v, sq121665[3].v), 4));
962
z[1].v = _mm_or_si128(_mm_unpackhi_epi64(sq[2].v, sq[3].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[2].v, sq121665[3].v), 4));
963
x[2].v = _mm_or_si128(_mm_unpacklo_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[4].v, sq121665[5].v), 4));
964
z[2].v = _mm_or_si128(_mm_unpackhi_epi64(sq[4].v, sq[5].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[4].v, sq121665[5].v), 4));
965
x[3].v = _mm_or_si128(_mm_unpacklo_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[6].v, sq121665[7].v), 4));
966
z[3].v = _mm_or_si128(_mm_unpackhi_epi64(sq[6].v, sq[7].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[6].v, sq121665[7].v), 4));
967
x[4].v = _mm_or_si128(_mm_unpacklo_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpacklo_epi64(sq121665[8].v, sq121665[9].v), 4));
968
z[4].v = _mm_or_si128(_mm_unpackhi_epi64(sq[8].v, sq[9].v), _mm_slli_si128(_mm_unpackhi_epi64(sq121665[8].v, sq121665[9].v), 4));
969
970
curve25519_sub_packed32(sub, x, z);
971
972
t[0].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(1,1,0,0));
973
t[1].v = _mm_shuffle_epi32(sub[0].v, _MM_SHUFFLE(3,3,2,2));
974
t[2].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(1,1,0,0));
975
t[3].v = _mm_shuffle_epi32(sub[1].v, _MM_SHUFFLE(3,3,2,2));
976
t[4].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(1,1,0,0));
977
t[5].v = _mm_shuffle_epi32(sub[2].v, _MM_SHUFFLE(3,3,2,2));
978
t[6].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(1,1,0,0));
979
t[7].v = _mm_shuffle_epi32(sub[3].v, _MM_SHUFFLE(3,3,2,2));
980
t[8].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(1,1,0,0));
981
t[9].v = _mm_shuffle_epi32(sub[4].v, _MM_SHUFFLE(3,3,2,2));
982
983
nqa[0].v = _mm_unpacklo_epi64(sq[0].v, t[0].v);
984
nqb[0].v = _mm_unpackhi_epi64(sq[0].v, t[0].v);
985
nqa[1].v = _mm_unpacklo_epi64(sq[1].v, t[1].v);
986
nqb[1].v = _mm_unpackhi_epi64(sq[1].v, t[1].v);
987
nqa[2].v = _mm_unpacklo_epi64(sq[2].v, t[2].v);
988
nqb[2].v = _mm_unpackhi_epi64(sq[2].v, t[2].v);
989
nqa[3].v = _mm_unpacklo_epi64(sq[3].v, t[3].v);
990
nqb[3].v = _mm_unpackhi_epi64(sq[3].v, t[3].v);
991
nqa[4].v = _mm_unpacklo_epi64(sq[4].v, t[4].v);
992
nqb[4].v = _mm_unpackhi_epi64(sq[4].v, t[4].v);
993
nqa[5].v = _mm_unpacklo_epi64(sq[5].v, t[5].v);
994
nqb[5].v = _mm_unpackhi_epi64(sq[5].v, t[5].v);
995
nqa[6].v = _mm_unpacklo_epi64(sq[6].v, t[6].v);
996
nqb[6].v = _mm_unpackhi_epi64(sq[6].v, t[6].v);
997
nqa[7].v = _mm_unpacklo_epi64(sq[7].v, t[7].v);
998
nqb[7].v = _mm_unpackhi_epi64(sq[7].v, t[7].v);
999
nqa[8].v = _mm_unpacklo_epi64(sq[8].v, t[8].v);
1000
nqb[8].v = _mm_unpackhi_epi64(sq[8].v, t[8].v);
1001
nqa[9].v = _mm_unpacklo_epi64(sq[9].v, t[9].v);
1002
nqb[9].v = _mm_unpackhi_epi64(sq[9].v, t[9].v);
1003
1004
curve25519_mul_packed64(nq, nqa, nqb);
1005
}
1006
1007
/*
1008
* In: b = 2^5 - 2^0
1009
* Out: b = 2^250 - 2^0
1010
*/
1011
void
1012
curve25519_pow_two5mtwo0_two250mtwo0(bignum25519 b) {
1013
ALIGN(16) bignum25519 t0,c;
1014
1015
/* 2^5 - 2^0 */ /* b */
1016
/* 2^10 - 2^5 */ curve25519_square_times(t0, b, 5);
1017
/* 2^10 - 2^0 */ curve25519_mul(b, t0, b);
1018
/* 2^20 - 2^10 */ curve25519_square_times(t0, b, 10);
1019
/* 2^20 - 2^0 */ curve25519_mul(c, t0, b);
1020
/* 2^40 - 2^20 */ curve25519_square_times(t0, c, 20);
1021
/* 2^40 - 2^0 */ curve25519_mul(t0, t0, c);
1022
/* 2^50 - 2^10 */ curve25519_square_times(t0, t0, 10);
1023
/* 2^50 - 2^0 */ curve25519_mul(b, t0, b);
1024
/* 2^100 - 2^50 */ curve25519_square_times(t0, b, 50);
1025
/* 2^100 - 2^0 */ curve25519_mul(c, t0, b);
1026
/* 2^200 - 2^100 */ curve25519_square_times(t0, c, 100);
1027
/* 2^200 - 2^0 */ curve25519_mul(t0, t0, c);
1028
/* 2^250 - 2^50 */ curve25519_square_times(t0, t0, 50);
1029
/* 2^250 - 2^0 */ curve25519_mul(b, t0, b);
1030
}
1031
1032
/*
1033
* z^(p - 2) = z(2^255 - 21)
1034
*/
1035
void
1036
curve25519_recip(bignum25519 out, const bignum25519 z) {
1037
ALIGN(16) bignum25519 a, t0, b;
1038
1039
/* 2 */ curve25519_square(a, z); /* a = 2 */
1040
/* 8 */ curve25519_square_times(t0, a, 2);
1041
/* 9 */ curve25519_mul(b, t0, z); /* b = 9 */
1042
/* 11 */ curve25519_mul(a, b, a); /* a = 11 */
1043
/* 22 */ curve25519_square(t0, a);
1044
/* 2^5 - 2^0 = 31 */ curve25519_mul(b, t0, b);
1045
/* 2^250 - 2^0 */ curve25519_pow_two5mtwo0_two250mtwo0(b);
1046
/* 2^255 - 2^5 */ curve25519_square_times(b, b, 5);
1047
/* 2^255 - 21 */ curve25519_mul(out, b, a);
1048
}
1049
1050
ANONYMOUS_NAMESPACE_END
1051
1052
NAMESPACE_BEGIN(CryptoPP)
1053
NAMESPACE_BEGIN(Donna)
1054
1055
/// \brief SSE2 curve25519 scalar multiplication
/// \param sharedKey  32-byte output buffer, filled by curve25519_contract
/// \param secretKey  32-byte scalar; a clamped private copy is used
/// \param othersKey  32-byte input expanded by curve25519_expand
/// \return always 0
/// \details Bits 254..3 of the clamped scalar drive a ladder using
///  conditional swaps; bits 2..0 are known to be zero after clamping and
///  are handled by three plain doubling steps. The result is
///  nqx * nqz^-1, contracted into sharedKey.
int curve25519_mult_SSE2(byte sharedKey[32], const byte secretKey[32], const byte othersKey[32])
{
    // Private, clamped copy of the scalar.
    // NOTE(review): FixedSizeSecBlock is presumably used so the copy is
    // wiped on destruction - confirm against secblock.h.
    FixedSizeSecBlock<byte, 32> scalar;
    for (size_t idx = 0; idx < 32; ++idx)
        scalar[idx] = secretKey[idx];
    scalar[0]  &= 0xf8;
    scalar[31] &= 0x7f;
    scalar[31] |= 0x40;

    ALIGN(16) bignum25519 nqx = {1};
    ALIGN(16) bignum25519 nqpqz = {1};
    ALIGN(16) bignum25519 nqz = {0};
    ALIGN(16) bignum25519 nqpqx;
    ALIGN(16) bignum25519 zmone;
    packed32bignum25519 qx, qz, pqz, pqx;
    packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
    bignum25519mulprecomp preq;
    size_t bit = 0;
    size_t lastbit = 0;

    curve25519_expand(nqpqx, othersKey);
    curve25519_mul_precompute(&preq, nqpqx);

    /* do bits 254..3 */
    for (size_t pos = 254; pos >= 3; --pos) {
        bit = (scalar[pos/8] >> (pos & 7)) & 1;

        /* swap only when the current bit differs from the previous one */
        curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
        curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
        lastbit = bit;

        curve25519_tangle32(qx, nqx, nqpqx);                /* qx = [nqx,nqpqx] */
        curve25519_tangle32(qz, nqz, nqpqz);                /* qz = [nqz,nqpqz] */

        curve25519_add_packed32(pqx, qx, qz);               /* pqx = [nqx+nqz,nqpqx+nqpqz] */
        curve25519_sub_packed32(pqz, qx, qz);               /* pqz = [nqx-nqz,nqpqx-nqpqz] */

        /* differential addition */
        curve25519_make_nqpq(primex, primez, pqx, pqz);     /* primex = [nqx+nqz,nqpqx+nqpqz], primez = [nqpqx-nqpqz,nqx-nqz] */
        curve25519_mul_packed64(prime, primex, primez);     /* prime = [nqx+nqz,nqpqx+nqpqz] * [nqpqx-nqpqz,nqx-nqz] */
        curve25519_addsub_packed64(prime);                  /* prime = [prime.x+prime.z,prime.x-prime.z] */
        curve25519_square_packed64(nqpq, prime);            /* nqpq = prime^2 */
        curve25519_untangle64(nqpqx, nqpqz, nqpq);
        curve25519_mul_precomputed(nqpqz, nqpqz, &preq);    /* nqpqz = nqpqz * q */

        /* doubling:
           (((sq.x-sq.z)*121665)+sq.x) * (sq.x-sq.z) is equivalent to (sq.x*121666-sq.z*121665) * (sq.x-sq.z) */
        curve25519_make_nq(nq, pqx, pqz);                   /* nq = [nqx+nqz,nqx-nqz] */
        curve25519_square_packed64(sq, nq);                 /* sq = nq^2 */
        curve25519_121665_packed64(sqscalar, sq);           /* sqscalar = sq * [121666,121665] */
        curve25519_final_nq(nq, sq, sqscalar);              /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
        curve25519_untangle64(nqx, nqz, nq);
    }

    /* it's possible to get rid of this swap with the swap in the above loop
       at the bottom instead of the top, but compilers seem to optimize better this way */
    curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
    curve25519_swap_conditional(nqz, nqpqz, (word32)bit);

    /* do bits 2..0 */
    for (size_t step = 0; step < 3; step++) {
        curve25519_compute_nq(nq, nqx, nqz);
        curve25519_square_packed64(sq, nq);                 /* sq = nq^2 */
        curve25519_121665_packed64(sqscalar, sq);           /* sqscalar = sq * [121666,121665] */
        curve25519_final_nq(nq, sq, sqscalar);              /* nq = [sq.x,sqscalar.x-sqscalar.z] * [sq.z,sq.x-sq.z] */
        curve25519_untangle64(nqx, nqz, nq);
    }

    /* result = nqx / nqz */
    curve25519_recip(zmone, nqz);
    curve25519_mul(nqz, nqx, zmone);
    curve25519_contract(sharedKey, nqz);

    return 0;
}
1119
1120
NAMESPACE_END // Donna
1121
NAMESPACE_END // CryptoPP
1122
1123
#endif // CRYPTOPP_CURVE25519_SSE2
1124
1125