Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
241818 views
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <math.h>
4
#include <memory.h>
5
#include "gmp.h"
6
#include "mpzutil.h"
7
#include "hecurve.h"
8
#include "ffwrapper.h"
9
#include "ffpoly.h"
10
#include "cstd.h"
11
12
/*
13
Copyright 2008 Andrew V. Sutherland
14
15
This file is part of smalljac.
16
17
smalljac is free software: you can redistribute it and/or modify
18
it under the terms of the GNU General Public License as published by
19
the Free Software Foundation, either version 2 of the License, or
20
(at your option) any later version.
21
22
smalljac is distributed in the hope that it will be useful,
23
but WITHOUT ANY WARRANTY; without even the implied warranty of
24
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
GNU General Public License for more details.
26
27
You should have received a copy of the GNU General Public License
28
along with smalljac. If not, see <http://www.gnu.org/licenses/>.
29
*/
30
31
/*
32
This module implements operations for genus 1 curves, both low level
33
point operations and functions for exponentiation computing the
34
group order, group structure, etc...
35
36
These functions are heavily optimized and responsible for a more than a 2x
37
improvement in genus 1 performance from smalljac version 3 over version 2.
38
39
We begin with basic point arithmetic functions (mostly inlined), the higher
40
level functions appear below. At the end of the module are a collection of functions
41
that were implement and test but are not used because they were found to be
42
sub-optimal in the current application.
43
*/
44
45
46
long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1);
47
int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1);
48
int hecurve_g1_bsgs_search (long *ord, hec_jc_t *b, long low, long high, int m, int a, ff_t f1);
49
long hecurve_g1_fastorder (ff_t x, ff_t y, long k, ff_t f1);
50
int hecurve_g1_test_exponent (long e, ff_t f[4]);
51
void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1);
52
53
unsigned long hecurve_expbits;
54
unsigned long hecurve_steps;
55
unsigned long hecurve_retries;
56
57
58
/*
59
We use a reduced form of the Chudnovsky Jacobian representation (JC) which uses (x,y,z^2,z^3) to represent the affine point (x/z^2,y/z^3), but does not maintain z.
60
Not computing z saves 1 field multiplication when compsing an affine point with JC point (usually called mixed addition). Squaring (doubling) costs an extra multiplication
61
but two fewer additions and the performance is comparable (in fact, when tested on an AMD Athlon 64, JC doubling was slighly faster, not clear why).
62
63
In terms of the Mumford representation used elsewhere, the point point (x,y,z^2,z^3) is u[1]=z, u[0]=-x/z^2, v[0]=y/x^3. (notice the sign on x)
64
When z is 1 this corresponds to u(t)=(t-x) and v(t)=y, so that x is a root of u and y=v(x).
65
66
Negating y inverts an element, as in the affine rep, and 2-torsion elements have y=0.
67
Any element with z=0 (or z2 for reduced Chudnovsky) is considered the identity, regardless of the value of x and y.
68
69
As we use multiplicative notation for group operations elsewhere, we generally speak of the elliptic curve group operation multiplicatively,
70
so we multiply, square, and exponentiate points, rather than adding, doubling, or multiplying by a scalar.
71
72
In our operation counts we don't distinguish field multiplication and squaring. For prime fields where p fits in a machine word, they are effectively the same.
73
We do count additions, as these are not negligible--roughly speaking, 1M is about 2.5A. Field inversions are horribly expensive relative to Montgomery multiplication,
74
costing 40M or more (for p~2^30, say). Whenever possible, we do n field inversions in parallel for a cost of I+3(n-1)M, effectively 3M per inverted element, for n large.
75
*/
76
77
// We trust the compiler to be smart about register allocation and don't hassle with macros (the speed difference appears to be negligible in testing)
78
// It would be nice to have C++ reference data types here...
79
80
81
static inline int hecurve_g1_JC_id (hec_jc_t *p) { return _ff_zero(p->z2); }
82
static inline int hecurve_g1_JC_2tor (hec_jc_t *p) { return _ff_zero(p->y); }
83
static inline void hecurve_g1_JC_invert (hec_jc_t *p) { ff_negate(p->y); }
84
85
static inline int hecurve_g1_JC_cmp (hec_jc_t *p1, hec_jc_t *p2)
86
{
87
register ff_t t0,t1;
88
89
_ff_mult(t0,p1->x,p2->z2); _ff_mult(t1,p2->x,p1->z2);
90
if ( ! _ff_equal(t0,t1) ) return 0;
91
_ff_mult(t0,p1->y,p2->z3); _ff_mult(t1,p2->y,p1->z3);
92
return ( _ff_equal(t0,t1) ? 1 : -1 );
93
}
94
95
// converts reduced Chudnovsky Jacobian to affine coords, given the inverse of z3, overlap ok
96
static inline void hecurve_g1_JC_to_A (ff_t *px, ff_t *py, hec_jc_t *p, ff_t z3inv)
97
{
98
register ff_t t0,t1;
99
100
ff_mult(*py,p->y,z3inv);
101
_ff_mult(t0,p->z2,z3inv);
102
_ff_square(t1,t0);
103
ff_mult(*px,p->x,t1);
104
// 4M
105
}
106
107
// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords
108
static inline void hecurve_g1_2AJC (hec_jc_t *p, ff_t x, ff_t y, ff_t f1)
109
{
110
register ff_t t0,t1,t2,a,b;
111
112
_ff_add(t0,y,y); _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2); // a=4xy^2, t1=2y^2, z2=4y^2
113
_ff_mult(p->z3,t0,p->z2); // z3=8y^3
114
_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^4
115
_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a
116
_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0); // y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)
117
// 7M+9A
118
}
119
120
// squares (doubles) an affine point into reduced Chudnovsky Jacobian coords and also sets az4 to f1*z3^4=16y^4 (cached value for squaring used below)
121
static inline void hecurve_g1_2AJC_cache (hec_jc_t *p, ff_t x, ff_t y, ff_t f1, ff_t *az4)
122
{
123
register ff_t t0,t1,t2,a,b;
124
125
_ff_add(t0,y,y); _ff_mult(t1,t0,y); _ff_add(p->z2,t1,t1); _ff_mult(a,x,p->z2); // a=4xy^2, t1=2y^2, z2=4y^2
126
_ff_mult(p->z3,t0,p->z2); // z3=8y^3
127
_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^4
128
_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a
129
_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_mult(t0, y,p->z3); _ff_sub(p->y,t2,t0); // y = b(a-x)-8y^4 (note we use the new x here and new z3=8y^3)
130
_ff_add(*az4,t0,t0);
131
// 7M+10A
132
}
133
134
// squares a point p1 in reduced Chudnovsky Jacobian coords (p3 is the output, may be equal to p1)
135
// This code requires one more multiplication than doubling in standard Jacobian coordinates (but two fewer additions, which makes it a close call)
136
// Surprisingly, it is actually slightly faster than doubling in Jacobian coords when tested on an AMD Athlon 64 (YMMV).
137
static inline void hecurve_g1_2JC (hec_jc_t *p3, hec_jc_t *p1, ff_t f1)
138
{
139
register ff_t a, b, c, t0, t1, t2;
140
141
_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); // t1 = 3x^2
142
_ff_square(t2,p1->z2); _ff_mult(t0,f1,t2); _ff_add(b,t1,t0); // b = 3x^2+f1*z^4 (note that f1=a4 in 13.2.1.c of HECHECC p.282)
143
_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x); // a=4xy^2, c=2y, t2=4y^2
144
_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a
145
_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1); // z2=4y^2z2, z3=8y^3z3, t1=8y^3
146
_ff_mult(c,t1,p1->y); // c = 8y^4
147
_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here
148
// 11M+8A
149
}
150
151
// same as above except the parameter az4 contains a_4*z1^4 = f1*z1^4 and is updated to hold a_4*z3^4 (cost 1M less, 1A more)
152
static inline void hecurve_g1_2JC_cache (hec_jc_t *p3, hec_jc_t *p1, ff_t *az4)
153
{
154
register ff_t a, b, c, t0, t1, t2;
155
156
_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); _ff_add(b,t1,*az4); // b = 3x^2+f1*z^4
157
_ff_add(c,p1->y,p1->y); _ff_square(t2,c); _ff_mult(a,t2,p1->x); // a=4xy^2, c=2y, t2=4y^2
158
_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a
159
_ff_mult(p3->z2,p1->z2,t2); _ff_mult(t1,c,t2); _ff_mult(p3->z3,p1->z3,t1); // z2=4y^2z2, z3=8y^3z3, t1=8y^3
160
_ff_mult(c,t1,p1->y); _ff_add(t0,c,c); ff_mult(*az4,*az4,t0); // c = 8y^4, az4 = az4*16y^4
161
_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here
162
// 10M+9A
163
}
164
165
// multiplies a non-identity point p1 in reduced Chudnovsky Jacobians coords by a (non-identity) affine point and puts result in p
166
// using the reduced Chudnosky form (no z, only z^2 and z^3) saves one field mult (this is faster than any of the alternatives in table 13.3 of HECHECC on p.284)
167
static inline void hecurve_g1_AJC (hec_jc_t *p, hec_jc_t *p1, ff_t x0, ff_t y0, ff_t f1)
168
{
169
register ff_t a, c, e, f, t0, t1, t2;
170
171
if ( _ff_zero(p1->z2) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3); return; }
172
_ff_mult(a,x0,p1->z2); // a = x0z^2, b = x*1^2=x (z0=1)
173
_ff_sub(e,p1->x,a); // e = a-b
174
_ff_mult(c,y0,p1->z3); // c = y0z^3, d=y*1^3=y (z0=1)
175
_ff_sub(f,p1->y,c); // f = d-c
176
if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJC (p, x0, y0, f1); return; } // must use doubling code here, but at least it's an affine double (7M+9A) (inverses are ok)
177
_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // e2=e^2, t1=e^3, t2=ae^2
178
ff_mult(p->z2,p1->z2,t0); ff_mult(p->z3,p1->z3,t1); // z2 = 1*z2*e^2, z3=1*z3*e^3
179
_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0); // x = f^2-e^3-2ae^2
180
_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0); // y = f(ae^2-x) - ce^3
181
// 10M+6A
182
}
183
184
// multiplies p1 in reduced Chudnovsky Jacobian coords by p2 in reduced Chudnovsky Jacobian coords and puts the result in p1.
185
// Handles identity and will square (double) if needed. Cost is 13M+7A, one less than with standard Chudnovsky Jacobian coords.
186
static inline void hecurve_g1_JCJC (hec_jc_t *p1, hec_jc_t *p2, ff_t f1)
187
{
188
register ff_t a, b, c, d, e, f, t0, t1, t2;
189
190
// we need to handle identity cases in general (although in some cases we might be able to rule this out)
191
if ( _ff_zero(p2->z2) ) return;
192
if ( _ff_zero(p1->z2) ) { *p1=*p2; return; }
193
_ff_mult(a,p1->x,p2->z2); _ff_mult(b,p2->x,p1->z2); _ff_sub(e,b,a); // a = x1z2^2, b = x2z1^2, e = a-b
194
_ff_mult(c,p1->y,p2->z3); _ff_mult(d,p2->y,p1->z3); _ff_sub(f,d,c); // c = y1z2^2, d = y2z1^3, f = d-c
195
if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2JC (p1,p1,f1); return; } // must double if pts are equal (11M+8A), inverses will end up with z2=0, so we let them through
196
_ff_square(t1,e); ff_mult(t0,p1->z2,p2->z2); _ff_mult(p1->z2,t0,t1); // z^2 = z1^2z2^2e^2, t1=e^2
197
_ff_mult(t2,t1,e); ff_mult(t0,p1->z3,p2->z3); _ff_mult(p1->z3,t0,t2); // z^2 = z1^2z2^2e^2, t2=e^3
198
ff_mult(t1,t1,a); // t1=ae^2
199
_ff_square(t0,f); _ff_sub(p1->x,t0,t2); _ff_add(t0,t1,t1); _ff_subfrom(p1->x,t0); // x = f^2-e^3-2ae^2
200
_ff_sub(t0,t1,p1->x); _ff_mult(t1,f,t0); _ff_mult(t0,t2,c); _ff_sub(p1->y,t1,t0); // y = f(ae^2-x) - ce^3 -- note we use the new x here
201
// 13M+7A
202
}
203
204
/*
205
Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates
206
It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M
207
*/
208
void hecurve_g1_AJC_exp_ui (hec_jc_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)
209
{
210
register unsigned long m;
211
unsigned long pbits, nbits;
212
register ff_t negy0;
213
int i;
214
215
if ( n == 0 ) { _ff_set_zero(p->z2); return; }
216
if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3); return; }
217
hecurve_g1_2AJC(p,x0,y0,f1);
218
if ( n == 2 ) return;
219
ui_NAF(&pbits,&nbits,n);
220
i = _asm_highbit(pbits)-2; // we know the top two bits of the NAF are 10
221
_ff_neg(negy0,y0);
222
m = (1UL<<i);
223
for ( ; m ; m >>= 1 ) {
224
hecurve_g1_2JC(p,p,f1); // 11M+8A
225
if ( m&pbits ) hecurve_g1_AJC(p,p,x0,y0,f1); // 10M+6A
226
if ( m&nbits ) hecurve_g1_AJC(p,p,x0,negy0,f1); // 10M+6A
227
}
228
}
229
230
/*
231
Computes p=a^e where a!=1 is in reduced Jacobian Chudnovsky coords, and so is p. overlap is ok.
232
*/
233
void hecurve_g1_JC_exp_ui (hec_jc_t *p, hec_jc_t *a, unsigned long n, ff_t f1)
234
{
235
register unsigned long m;
236
unsigned long pbits, nbits;
237
hec_jc_t t, ai;
238
int i;
239
240
if ( n == 0 ) { _ff_set_zero(p->z2); return; }
241
if ( n == 1 ) { *p=*a; return; }
242
hecurve_g1_2JC(&t,a,f1); // 11M+8A
243
244
if ( n == 2 ) { *p = t; return; }
245
ui_NAF(&pbits,&nbits,n);
246
i = _asm_highbit(pbits)-2;
247
ai=*a; hecurve_g1_JC_invert(&ai);
248
m = (1UL<<i);
249
for ( ; m ; m >>= 1 ) {
250
hecurve_g1_2JC(&t,&t,f1); // 11M+8A
251
if ( m&pbits ) hecurve_g1_JCJC(&t,a,f1); // 13M+7A
252
if ( m&nbits ) hecurve_g1_JCJC(&t,&ai,f1); // 13M+7A
253
}
254
*p = t;
255
}
256
257
// Computes b=a^e where a and b are both in affine coordinates
258
void hecurve_g1_exp_ui (ff_t u[HECURVE_GENUS+1], ff_t v[HECURVE_GENUS], ff_t u1[HECURVE_GENUS+1], ff_t v1[HECURVE_GENUS], unsigned long n, ff_t f[HECURVE_DEGREE+1])
259
{
260
ff_t zinv,x0;
261
hec_jc_t o[1];
262
263
if ( _hecurve_is_identity(u1,v1) ) { _hecurve_set_identity(u,v); return; }
264
if ( ! _ff_one(u1[1]) ) { printf ("p=%ld, input to hecurve_g1_exp_ui most be in affine coords!\n",_ff_p); hecurve_print(u,v); exit(0); }
265
_ff_neg(x0,u1[0]);
266
hecurve_g1_AJC_exp_ui (o, x0, v1[0], n, f[1]);
267
if ( hecurve_g1_JC_id(o) ) { _hecurve_set_identity(u,v); return; }
268
ff_invert(zinv,o->z3);
269
hecurve_g1_JC_to_A(&x0,v,o,zinv);
270
_ff_neg(u[0],x0);
271
_ff_set_one(u[1]);
272
}
273
274
// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n. Assumes all required powers are present: NAF potentially requires one more bit!
275
// CAUTION: if this is false, the resulting behavior is very strange and unpredictable. You have been warned.
276
void hecurve_g1_JC_exp_powers(hec_jc_t *o, hec_jc_t *p, unsigned long n, ff_t f1)
277
{
278
register unsigned long m;
279
unsigned long pbits, nbits;
280
hec_jc_t t;
281
int i;
282
283
// handle small n quickly
284
switch (n) {
285
case 0: _ff_set_zero(o->z2); return;
286
case 1: *o=p[0]; return;
287
case 2: *o=p[1]; return;
288
case 3: *o=p[0]; hecurve_g1_JCJC(o,p+1,f1); return;
289
case 4: *o=p[2]; return;
290
case 5: *o=p[2]; hecurve_g1_JCJC(o,p,f1); return;
291
case 6: *o=p[2]; hecurve_g1_JCJC(o,p+1,f1); return;
292
case 7: *o=p[0]; hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+3,f1); return;
293
case 8: *o=p[3]; return;
294
}
295
ui_NAF(&pbits,&nbits,n);
296
i = _asm_highbit(pbits);
297
*o = p[i];
298
i-=2;
299
m = (1UL<<i);
300
for ( ; m ; m >>= 1, i-- ) {
301
if ( (m&pbits) ) hecurve_g1_JCJC(o,p+i,f1); // 13M+8A
302
if ( (m&nbits) ) { // 13M+10A (slightly more expensive)
303
hecurve_g1_JC_invert(o); hecurve_g1_JCJC(o,p+i,f1); hecurve_g1_JC_invert(o);
304
}
305
}
306
}
307
308
// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in reduced Jacobian Chudnovsky coords.
309
void hecurve_g1_AJC_powers (hec_jc_t p[], ff_t x0, ff_t y0, int k, ff_t f1)
310
{
311
register hec_jc_t *e;
312
ff_t az4;
313
314
_ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z2); _ff_set_one(p->z3);
315
hecurve_g1_2AJC_cache (p+1, x0, y0, f1, &az4); // 7M + 10A
316
for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2JC_cache(p,p-1,&az4); // 10M + 9A
317
}
318
319
// Sets p[i] = p[0]^(2^i) for i in [1,k]. Input is an pt not equal to the identity, input and output is in reduced Jacobian Chudnovsky coords.
320
void hecurve_g1_JC_powers (hec_jc_t p[], int k, ff_t f1)
321
{
322
register hec_jc_t *e;
323
324
// don't bother caching
325
for ( e=p+k,p++ ; p <= e ; p++ ) hecurve_g1_2JC(p,p-1,f1); // 11M + 8A
326
}
327
328
// Sets s[i] = s[0]*(x0,y0)^i for i < n. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
329
int hecurve_g1_AJC_steps_1 (hec_jc_t s[], ff_t x0, ff_t y0, int n, ff_t f1)
330
{
331
register hec_jc_t *end;
332
333
if ( hecurve_g1_JC_id(s) ) return 0;
334
end = s+n;
335
for ( s++ ; s < end ; s++ ) {
336
hecurve_g1_AJC (s,s-1,x0,y0,f1); // 10M + 6A
337
if ( hecurve_g1_JC_id(s) ) break;
338
}
339
hecurve_steps += n-(end-s);
340
return n-(end-s);
341
}
342
343
// Sets s[-i] = s[0]*(x0,-y0)^i for i < n. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
344
// Used for downward stepping - NOTE THAT s SHOULD POINT TO THE LAST ENTRY IN THE ARRAY
345
int hecurve_g1_AJC_dsteps_1 (hec_jc_t *s, ff_t x0, ff_t y0, int n, ff_t f1)
346
{
347
register hec_jc_t *end;
348
register ff_t negy0;
349
350
if ( hecurve_g1_JC_id(s) ) return 0;
351
end = s-n;
352
_ff_neg(negy0,y0);
353
for ( s-- ; s > end ; s-- ) {
354
hecurve_g1_AJC (s,s+1,x0,negy0,f1); // 10M + 6A
355
if ( hecurve_g1_JC_id(s) ) break;
356
}
357
hecurve_steps += n-(s-end);
358
return n-(s-end);
359
}
360
361
// sets s[2*i+j] = s[0]*(x0,y0)^(i+j)*(x1,y1)^i for 2*i+j < n with j=0 or 1. Returns k<n if s[k] is the identity (in which case no further elements are computed), otherwise returns n.
362
int hecurve_g1_AJC_steps_2 (hec_jc_t s[], ff_t x0, ff_t y0, ff_t x1, ff_t y1, int n, ff_t f1)
363
{
364
register hec_jc_t *end;
365
register int i;
366
367
if ( hecurve_g1_JC_id(s) ) return 0;
368
end = s+n;
369
for ( i=1,s++ ; s < end ; i++,s++ ) {
370
if ( i&1) {
371
hecurve_g1_AJC (s,s-1,x0,y0,f1); // 10M + 6A
372
} else {
373
hecurve_g1_AJC (s,s-1,x1,y1,f1); // 10M + 6A
374
}
375
if ( hecurve_g1_JC_id(s) ) break;
376
}
377
hecurve_steps += n-(end-s);
378
return n-(end-s);
379
}
380
381
static inline int inv_mod3 (int e) { return ((e%3)==1?1:2); }
382
383
/*
384
Fast group order computation for elliptic curves over F_p, designed for p<2^40, optimized for p~2^30.
385
Returns 1 if successful and sets *o to the group order and if pd is non-null, sets *pd to gcd(m,6), where the group structure is Z/mZ x Z/nZ with m dividing n (possibly m=1).
386
This information can be used to speed up group structure computations.
387
388
Failure can only happen for p < 229, in which case 0 is returned and *o is a divisor of the group order highly likely equal to the group exponent (*pd is as also set, as above).
389
*/
390
long hecurve_g1_order (long *pd, ff_t f[4])
391
{
392
long r, d, e, E, min, max, low, high, exp, M;
393
hec_jc_t t[1];
394
ff_t g[4],x,y,*h;
395
int a,i,j,m,twist,s2known,tor3,tor4,flag8;
396
397
/*
398
We compute the 3-torsion subgroup first, because this information may lead us to work in the twist.
399
The function ff_poly_g1_3tor() computes the 3-torsion subgroup of y^2=f(x), but if it is trivial, it will try to
400
determine the 3-torsion subgroup of the twist, using the factorization pattern of the 3-division polynomial (a quartic).
401
*/
402
h = f; twist = 0;
403
tor3 = ff_poly_g1_3tor(f);
404
if ( tor3<0 ) { ff_poly_twist(g,f,3); h = g; twist = 1; tor3=-tor3; } // negative return value indicates 3-torsion in the twist (but not in the primary)
405
d = ( tor3==9 ? 3 : 1); // d is a known divisor of |G|/lambda(|G|), i.e. it divides the group exponent (lambda(G)) and its square divides |G|.
406
407
if ( _ff_p < HECURVE_G1_4TOR_MINP ) { // when p is small, only compute 2-torsion but not 4-torsion (the marginal benefit does not justify the cost)
408
i = ff_poly_roots_d3(0,h);
409
if ( i == 0 ) { // no roots means there is no 2-torsion and the group order must be odd
410
s2known = 1; tor4 = 1; // the flag s2known indicates we know the entire 2-Sylow subgroup
411
} else { // this means that once we factor it out, the remaining exponent is known to be odd
412
s2known = 0; tor4 = 2; // if there is one root, we have 2-torsion Z/2Z, otherwise there are 3 roots and 2-torsion Z/2Z x Z/2Z
413
if ( i > 1 ) { tor4 = 4; d *= 2; }
414
}
415
} else {
416
s2known = 0;
417
flag8 = ( _ff_p < HECURVE_G1_8TOR_MINP ? 0 : 1); // flag8 set if we also want to check whether the 2-Sylow subgroup is a cyclic group containing Z/8Z
418
d *= ff_poly_g1_4tor(&tor4,h,flag8); // compute the 4-torsion subgroup (and, optionally, check Z/8Z as well)
419
if ( flag8 ) {
420
if ( tor4 <= 4 ) s2known = 1; // In these cases we know the entire 2-Sylow subgoup
421
} else {
422
if ( tor4 < 4 || (tor4==4 && d==2) ) s2known = 1; // if the 4-torsion subgroup contains no elements order 4, it must be equal to the 2-Sylow subgroup.
423
}
424
}
425
e = tor4*tor3/d; // e is our known divisor of lambda(|G|)
426
E = d*e; // E is our known divisor of |G|
427
m = (s2known==1?2:1)*(tor3==1?3:1); // we know that gcd(|G|/tor4,m) = 1, which we can use to speed up the BSGS search
428
429
a = ( tor3==1 && _ff_p1mod3 ? -1 : 0 ); // if neither the curve or its twist has 3-torsion and p=1mod3, we must have a_p=0 and group order 2 mod 3
430
// For any divisor x of |G| this means |G|/x must be equal to -1/x mod 3 (and also mod 6 if m=6)
431
r = (long)(2.0*sqrt(_ff_p));
432
exp = _ff_p+1;
433
min = exp-r;
434
max = exp+r;
435
low = _ui_ceil_ratio(min,E);
436
high = max/E;
437
if ( low > high ) { printf ("Error, no multiple of torsion derived e=%ld exists in interval [%ld,%ld] for p=%ld\n", e, min, max, _ff_p); exit(0); }
438
if ( pd ) { *pd = ( twist && tor3==9 ? d/3 : d ); if ( !((*pd)&3) ) *pd/=2; } // set *pd to reflect 2-torsion and 3-torsion information in y^2=f(x) (but not the twist)
439
440
//printf("%ld: tor3=%d, tor4=%d, e=%d, d=%d ", _ff_p, tor3, tor4, e, d); ff_poly_print(h,3);
441
442
// Note that for a non-CM curve the probability that hecurve_g1_bsgs succeeds on the first try is close to 1 and grows with p (say 99.9% for p~2^20)
443
for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {
444
if ( low==high ) {exp=low; break; }
445
if ( ! hecurve_random_point(&x,&y,h) ) continue; // this can fail, e.g. y^2=x^3+2x+2 over F_3 has no finite rational points
446
//printf ("%ld: e=%d, Random pt (%ld,%ld) on curve ", _ff_p, e, _ff_get_ui(x), _ff_get_ui(y)); ff_poly_print(h,3);
447
hecurve_g1_AJC_exp_ui (t, x, y, e, h[1]); if ( hecurve_g1_JC_id(t) ) continue;
448
// handle order 2 elements here so that bsgs search can assume |t| > 2
449
if ( hecurve_g1_JC_2tor(t) ) {
450
exp = 2;
451
//On paper, the code below should speed things up (it uses no inversions and fewer operations for small intervals), but in testing it actually slows things down slightly
452
//} else if ( high-low < HECURVE_G1_SHORT_INTERVAL ) {
453
// if ( hecurve_g1_bsgs_short (&exp, &t, low, high, h[1]) ) break;
454
} else {
455
if ( a ) { a = -inv_mod3(E); if ( m==6 && a==-2 ) a = -5; }
456
if ( hecurve_g1_bsgs_search (&exp, t, low, high, m, a, h[1]) ) break;
457
}
458
e *= exp; E *= exp;
459
low = _ui_ceil_ratio(min,E);
460
high = max/E;
461
// there are lot's of optimizations we could insert here, but none of them apply often enough to be worth doing, so keep it simple.
462
hecurve_retries++;
463
}
464
if ( i < HECURVE_G1_ORDER_RETRIES ) {
465
E *= exp;
466
if ( twist ) E = 2*(_ff_p+1)-E;
467
if ( E < min || E > max ) { printf ("Error, computed order %ld is not in Hasse-Weil interval [%ld,%ld] for p=%ld, h= ", E, min, max, _ff_p); ff_poly_print(h,3); exit(0); }
468
return E;
469
}
470
// For non-CM curves we will almost never reach this point.
471
472
if ( e > 2*r ) { printf("Error, exponent %ld for p=%ld has unique multiple in [%ld,%ld] that was not detected.\n", e, _ff_p, min, max); exit(0); }
473
if ( e < sqrt(min) ) { printf ("Error, exponent %ld is impossibly small for p=%ld\n", e, _ff_p); exit(0); }
474
475
/*
476
With very high probability (exponential in HECURVE_G1_ORDER_RETRIES), e is now the group exponent. For all but 21 primes (the largest of which is 547)
477
this uniquely determines the group order (this result is due to Cremona and Harley, or see Washington, "Elliptic Curves", Prop. 4.19).
478
In fact, taking our knowledge of 2-torsion and 3-torsion into account, the group order is uniquely determined in every case (AVS: to be written up).
479
Unfortunately, we can't prove that e is definitely the group exponent without computing the group structure.
480
481
Instead, we apply results of Schoof and Mestre (see Washington, Prop. 4.18) which tells us that either the curve or its twist contains an element whose
482
order has a unique multiple in the Hasse interval provided p > 229. If one examines the exceptional cases for p<=229, one finds that if we also
483
consider the information provided by 2-torsion, in every case but two (y^2=x^3+2 over F_13 and F_19) we obtain a known divisor of the group order
484
with a unique multiple in the Hasse interval, and the two exceptional cases are addressed if we consider 3-torsion. (AVS: to be written up).
485
486
In the two 3-torsion cases, we will have chosen h to be the twist with 3-torsion and should have succeeded above. If not we mustn't have computed
487
the full group exponent and we should try again (by recursively calling ourselves below)
488
489
For the 2-torsion cases, the twist will also have 2-torsion, we just need to use this information when we try to find a unique multiple of the exponent
490
in the twist below. If we fail, it must mean we got unlucky and should try again.
491
492
In summary, this yields a Las Vegas algorithm with provably correct output, for all odd p (for p=2 the curve y^2=f(x) is singular).
493
*/
494
495
m = (d%2?1:2); // m is a known divisor of the twist group order based on the rank of the 2-torsion subgroup, note that we don't keep track of d anymore and reuse it below
496
497
for ( M = e*_ui_ceil_ratio(min,e) ; M <= max ; M += e ) { d = M/e; if ( !(e%d) && !((_ff_p-1)%d) ) break; }
498
if ( M > max ) { printf ("Error: no multiple M of ambiguous e=%ld satisfies M/e | gcd(e,p-1) (p=%ld).\n", e, _ff_p); exit (0); }
499
if ( ! twist ) { ff_poly_twist(g,f,3); h = g; } else { h = f; }
500
501
do {
502
r = 2*(_ff_p+1)-M;
503
// Attempt to prove that r is the order of the twist. We don't bother trying to be efficient here, we expect to succeed on the first try.
504
for ( i = 0 ; i < HECURVE_G1_ORDER_RETRIES ; i++ ) {
505
if ( ! hecurve_random_point(&x,&y,h) ) { printf("hecurve_random_point failed!"); exit (0); }
506
hecurve_g1_AJC_exp_ui (t, x, y, r, h[1]);
507
if ( ! hecurve_g1_JC_id(t) ) break; // can't be the right M
508
exp = m*hecurve_g1_fastorder(x,y,r,h[1]); // compute the order of our random point, times our 2-torsion derived info
509
low = exp*_ui_ceil_ratio(min,exp); // low is the first multiple of exp >= min
510
if ( low <= max && low+exp > max ) { return ( twist ? 2*(_ff_p+1)-M : M ); } // success
511
}
512
// try another candidate, if there is one (this can happen for small p)
513
for ( M+= e; M <= max ; M += e ) { d = M/e; if ( !(e%d) && !((_ff_p-1)%d) ) break; }
514
} while ( M <= max );
515
// Otherwise, try again -- we must have not gotten the full group exponent (this essentially never happens unless HECURVE_G1_ORDER_RETRIES is set quite low).
516
return hecurve_g1_order(pd,f);
517
}
518
519
/*
520
Given the group order N, compute the isomorphism type of the group structure of an elliptic curve, of the form Z/n1Z x Z/n2Z with n1 dividing n2 (and also p-1), possibly n1=1.
521
The parameter d specifies gcd(6,n1), which can be derived from 2-torsion and 3-torsion info by hecurve_g1_order above (specify 0 if not known).
522
The rank of the group (1 or 2) is returned, and n[0]=n2 if n1=1, o.w. n[0]=n1 and n[1]=n2. Note that n2 is the group exponent.
523
524
It is possible to compute the group invariants (n1 and n2) in probabilistic (Las Vegas) polynomial time using the Weil pairing via Miller's algorithm
525
(although this doesn't actually determine a basis, i.e. independent elements of order n1 and n2).
526
527
We take a simpler generic approach which is very fast in the typical case. We may in the future want to invoke (refinements of) Miller's algorithm for
528
hard cases (large prime dividing p-1 whose square divides N). As it stands, the algorithm below has complexity O(q^(1/2)) where q is the largest prime dividing p-1
529
whose square divides N. This is O(N^(1/4)) in the worst case, but this rarely happens and in practice computing the group structure takes about 10% the
530
time it takes to compute the group order (for a non-CM curve). (It might be interesting to do a performance comparison wth Miller's algorithm at some point)
531
532
We just return the group invariants here, and optimize for computing these as quickly as possible, but the algorithm below can easily be modified to return a basis
533
without changing the complexity (but the constant factors will be slightly worse). This is a Las Vegas algorithm, i.e. provably correct output and bounded expected running time.
534
*/
535
int hecurve_g1_group_structure (long n[2], long N, long d, ff_t f[4])
536
{
537
long m,n1,n2,e;
538
hec_jc_t b1, b2;
539
unsigned long p[MAX_UI_PP_FACTORS], q[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];
540
ff_t x,y;
541
long q0, q1, q2, r;
542
int i, j, k;
543
544
/*
545
For each prime q|N, if q does not divide p-1 we know the q-Sylow subgroup is cyclic. Even when q does divide p-1, if q^2 does not divide N, we
546
again know the q-Sylow is cyclic (of prime order in this case). Applying just these 2 facts will often suffice to determine the group structure without
547
using any group operations, particularly if information in d is also applied (e.g. 2-torsion and 3-torsion info).
548
*/
549
k = ui_factor(p,h,ui_gcd(_ff_p-1,N));
550
for ( i = j = 0 ; i < k ; i++ ) { // make a list of primes p[i] dividing p-1 whose square divides N
551
if ( p[i]==2 && d && d%2 ) continue; // d=gcd(6,n1) not divisible by 2 implies 2-Sylow must be cyclic
552
if ( p[i]==3 && d && d%3 ) continue; // d=gcd(6,n1) not divisible by 3 implies 3-Sylow must be cyclic
553
if ( !(N%(p[i]*p[i])) ) p[j++] = p[i];
554
}
555
n1 = 1; n2 = N;
556
for ( i = 0 ; i < j ; i++ ) {
557
for ( q[i] = p[i]*p[i] ; !(N%(q[i]*p[i])) ; q[i] *= p[i] ); // determine power of p[i] dividing N
558
n2 /= q[i]; // remove q[i] from n2
559
}
560
//printf("%ld: N=%d ", _ff_p, N); ff_poly_print(f,3);
561
// Now determine the p[i]-Sylow subgroups for p[i] dividing p-1 and p[i]^2 dividing N (if there are none, we have no work to do).
562
for ( i = 0 ; i < j ; i++ ) {
563
if ( d ) for ( q0 = 1 ; ! (d%(p[i]*q0)) ; q0 *= p[i] ); else q0 = 1; // if d was specified, use d to compute q0, a known divisor of q1 (and q2), possibly 1
564
e = N/(q[i]*n1); // the p[i]-Sylow subgroup must lie in the image of the e-power map (we can take out the n1 we know)
565
for ( q1 = q2 = q0, k=0 ; q1*q2 < q[i] && k < 1000 ; k++ ) { // bound the number of retries as a safety valve, just in case the group order was invalid
566
if ( ! hecurve_random_point(&x,&y,f) ) continue; // get a random finite point
567
hecurve_g1_AJC_exp_ui (&b1, x, y, e, f[1]); // get an element of the p[i]-Sylow via the e-power map
568
r = hecurve_g1_JC_pp_order (&b1,p[i],q[i],f[1]); // compute its order, which will be a power of p[i]
569
if ( r <= q1 ) continue; // if the order of b1 is not strictly larger than q1, it isn't going to help us
570
if ( q2==q0 ) { b2=b1; q2 = r; continue; } // first time through we will just set q2
571
hecurve_g1_p_reduce (&b1, &r, &b2, &q2, p[i], f[1]); // reduce to a basis for <b1,b2>
572
q1 = ( r > q0 ? r : q0); // note that if r<q0, then q1 is not the order of b1, but we don't care
573
}
574
if ( k == 1000 ) { printf ("Group structure computation failed on %d-Sylow of size %d with group of order %d on curve over F_%d: ", p[i], q[i], N, _ff_p); ff_poly_print(f,3); exit (0); }
575
n1 *= q1; n2 *= q2;
576
}
577
if ( n1*n2 != N ) { printf ("bug in hecurve_g1_group_order, %d*%d != %d, F_%ld curve ", n1, n2, N, _ff_p); ff_poly_print(f,3); exit(0); } // sanity check
578
if ( n1 == 1 ) { n[0] = n2; return 1; }
579
n[0] = n1; n[1] = n2;
580
return 2;
581
}
582
583
/*
584
Slow Monte Carlo algorithm to test the group exponent, use for debugging/testing only.
585
*/
586
int hecurve_g1_test_exponent (long e, ff_t f[4])
587
{
588
hec_jc_t t[1];
589
ff_t x,y;
590
long m, n;
591
int i;
592
593
n = 1;
594
for ( i = 0 ; i < 100 ; i++ ) {
595
if ( ! hecurve_random_point(&x,&y,f) ) continue;
596
hecurve_g1_AJC_exp_ui(t,x,y,e,f[1]);
597
if ( ! hecurve_g1_JC_id(t) ) return 0;
598
m = hecurve_g1_fastorder (x,y,e,f[1]);
599
if ( n%m ) n = m*(n/ui_gcd(m,n));
600
}
601
return (n==e);
602
}
603
604
/*
605
Given non-trivial elements b1 and b2 of the p-Sylow subgroup with |b1|=q1 and |b2|=q2 powers of p (p here is a divisor of the group order, not the characteristic of the field),
606
The function below computes a basis for <b1,b2> and updates b1,b2,q1, and q2 appropriately, with q1 < q2 (possibly q1=1 if <b1,b2> is cyclic).
607
608
The complexity is O(log_p(q1)*(sqrt(p)+log(q2)).
609
AVS: the code below is a special case of a new algorithm for group structure computation (a simpler and faster version of Algorithm 9.2 in my thesis which also incorporates
610
a few ideas from Teske's Pohlig-Hellman paper). I am in the process of writing a paper which I will eventually reference here.
611
*/
612
void hecurve_g1_p_reduce (hec_jc_t *b1, long *q1, hec_jc_t *b2, long *q2, long p, ff_t f1)
613
{
614
hec_jc_t t[1],a1[1],a2[1];
615
long k;
616
int i;
617
618
if ( *q1 > *q2 ) { t[0] = *b1; *b1 = *b2; *b2 = t[0]; k = *q1; *q1 = *q2; *q2 = k; } // swap inputs if needed so that q1 <= q2
619
620
hecurve_g1_JC_exp_ui(a2,b2,*q2/p,f1); // a2=b2^(q2/p), <a2> is the subgroup of <b2> of order p
621
while (*q1>1) {
622
hecurve_g1_JC_exp_ui(a1,b1,(*q1)/p,f1); // a1=b1^(q1/p), <a1> is the subgroup of <b1> of order p
623
k = hecurve_g1_dlog (a2,a1,p,f1); // compute least k>0 s.t. a1=a2^k, if possible
624
if ( ! k ) break; // if we can't, then a1 and a2 are independent, hence so are b1 and b2, and we are done
625
hecurve_g1_JC_exp_ui(t,b2,(*q2)/(*q1)*k,f1); hecurve_g1_JC_invert(t); // compute t=b2^-(q2/q1*k)
626
hecurve_g1_JCJC (b1,t,f1); // replace b1 with b1*b2^-(q2/q1*k), this reduces the order of b1 to at most q1/p since
627
// (b1*b2^-(q2/q1*k))^(q1/p) = a1*a2^{-k} = 1, and we do not change <b1,b2> by doing this.
628
k = hecurve_g1_JC_pp_order(b1,p,*q1,f1); // recompute q1 (it could be less than q1/p)
629
if ( k >= *q1 ) { printf ("%ld: Error in hecurve_g1_p_reduce(%d,%d) element order %d not reduced\n", _ff_p, *q1, *q2, k); exit (0); } // sanity check
630
*q1 = k;
631
}
632
}
633
634
635
long hecurve_g1_JC_pp_order (hec_jc_t *a, long p, long q, ff_t f1)
636
{
637
register long r;
638
hec_jc_t t[1];
639
640
if ( hecurve_g1_JC_id(a) ) return 1;
641
if ( p==q ) return p;
642
hecurve_g1_JC_exp_ui (t,a,p,f1);
643
if ( hecurve_g1_JC_id(t) ) return p;
644
for ( r = p*p ; r < q ; r *= p ) {
645
hecurve_g1_JC_exp_ui (t,t,p,f1);
646
if ( hecurve_g1_JC_id(t) ) return r;
647
}
648
return q;
649
}
650
651
/*
652
Fast and simple small hash table lookup used by hecurve_g1_bsgs_search and also by hecurve_g1_dlog.
653
*/
654
655
#if FF_HALF_WORDS == 1
656
#define BSGS_MAX_STEPS 256 // guaranteed to handle p<2^31 (single word ff_t), assuming 2 torsion is known, enlarge if needed but it's nice to fit in L1 cache
657
#else
658
#define BSGS_MAX_STEPS 1024 // will BSGS up to 2^40, assuming 2 torsion is known, and nearly as high for dlogs
659
#endif
660
#define BSGS_TABSIZE BSGS_MAX_STEPS // don't make this too big, it takes time to initialize it. A few collisions won't kill us.
661
#define BSGS_TABMASK ((unsigned long)(BSGS_TABSIZE-1))
662
663
static hec_jc_t babys[BSGS_MAX_STEPS];
664
static hec_jc_t giants[BSGS_MAX_STEPS];
665
666
static ff_t stepzs[2*BSGS_MAX_STEPS];
667
#if FF_HALF_WORDS == 1
668
static unsigned char hashtab[BSGS_TABSIZE];
669
#else
670
static unsigned short hashtab[BSGS_TABSIZE];
671
#endif
672
static struct tab_entry {
673
ff_t x;
674
short i;
675
short next;
676
} entries[BSGS_MAX_STEPS+1];
677
short nexttabentry;
678
679
static inline void tab_clear() { memset(hashtab,0,sizeof(hashtab)); nexttabentry = 1; } // don't use entry 0
680
681
// we require inserts to have unique x values, return -1 if unique, otherwise return index value for existing entry
682
static inline int tab_insert(ff_t x, short i)
683
{
684
register struct tab_entry *p;
685
register int n,h;
686
687
h = x&BSGS_TABMASK;
688
n = hashtab[h];
689
if ( n ) {
690
p=entries+n;
691
for(;;) {
692
if ( _ff_equal(p->x,x) ) return p->i;
693
if ( ! p->next ) break;
694
p=entries+p->next;
695
}
696
}
697
p = entries+nexttabentry;
698
_ff_set(p->x,x);
699
p->i = i;
700
p->next = n;
701
hashtab[h] = nexttabentry++;
702
return -1;
703
}
704
705
static inline int tab_lookup(ff_t x)
706
{
707
register struct tab_entry *p;
708
register int n;
709
710
n = hashtab[x&BSGS_TABMASK];
711
if ( ! n ) return -1;
712
p=entries+n;
713
for(;;) {
714
if ( _ff_equal(p->x,x) ) return p->i;
715
if ( ! p->next ) return -1;
716
p=entries+p->next;
717
}
718
return -1;
719
}
720
721
/*
722
Computes the discrete log of b wrt a given the order of a using a BSGS search. Optimized for small searches (shares table lookup code with hecurve_g1_bsgs_search).
723
Returns 0 if b is not an element of <a> (this is not assumed).
724
725
Note that we are only called for prime values whose square divides the group order, so we only use O(N^(1/4)) steps in the worst case, and even this is very rare
726
since the prime must also divide p-1.
727
*/
728
int hecurve_g1_dlog (hec_jc_t *a, hec_jc_t *b, long o, ff_t f1)
729
{
730
hec_jc_t c[1];
731
long bsteps, giant, gstep, gsteps;
732
ff_t zinv[2], baby_x[1], baby_y[1], giant_x[1], giant_y[1];
733
ff_t t0, t1;
734
register int i, j;
735
736
// hard-wire small o cases
737
if ( hecurve_g1_JC_id(a) ) return o;
738
switch ( o ) {
739
case 1: return 0;
740
case 2: return ( hecurve_g1_JC_cmp (a,b) ? 1 : 0 );
741
case 3: i = hecurve_g1_JC_cmp (a,b); return ( i ? (i>0?1:2) : 0 );
742
}
743
744
//printf ("dlog(%d) a=(%d,%d,%d,%d) b=(%d,%d,%d,%d)\n", o, _ff_get_ui(a->x), _ff_get_ui(a->y), _ff_get_ui(a->z2), _ff_get_ui(a->z3), _ff_get_ui(b->x), _ff_get_ui(b->y), _ff_get_ui(b->z2), _ff_get_ui(b->z3));
745
746
// For small searches we will just search the whole range so we only match once (using just one inversion). The only optimization we use is the usual one for inverses.
747
bsteps = (long)sqrt(o/2);
748
gstep = 2*bsteps+1;
749
gsteps = _ui_ceil_ratio(o,gstep);
750
if ( bsteps > BSGS_MAX_STEPS || gsteps > BSGS_MAX_STEPS ) { printf ("%ld: order %ld caused BSGS_MAX_STEPS to be exceeded in hecurve_g1_dlog\n", _ff_p, o); exit (0); }
751
//printf("bsteps=%d, gsteps=%d, giant step=%d\n", bsteps, gsteps, gstep);
752
753
// Convert baby and giant step to affine coords for stepping
754
hecurve_g1_JC_exp_ui (c,a,gstep,f1);
755
hecurve_g1_JC_invert(c);
756
_ff_set(zinv[0],a->z3);
757
_ff_set(zinv[1],c->z3);
758
ff_parallel_invert (zinv, zinv, 2);
759
hecurve_g1_JC_to_A (baby_x, baby_y, a, zinv[0]);
760
hecurve_g1_JC_to_A (giant_x, giant_y, c, zinv[1]);
761
//printf ("baby step(a) = (%d,%d)\n", _ff_get_ui(baby_x[0]), _ff_get_ui(baby_y[0]));
762
//printf ("giant step(a^-%d) = (%d,%d)\n", gstep, _ff_get_ui(giant_x[0]), _ff_get_ui(giant_y[0]));
763
764
// Step
765
babys[0] = *a; giants[0] = *b;
766
j = hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1); // baby steps
767
if ( j < bsteps ) { printf ("%ld: baby step hit identity in dlog with o=%ld\n", _ff_p, o); exit(0); }
768
j = hecurve_g1_AJC_steps_1 (giants, giant_x[0], giant_y[0], gsteps, f1); // giant steps
769
if ( j < gsteps ) return j*gstep;
770
771
// Convert to affine to get unique x coords
772
for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);
773
for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);
774
ff_parallel_invert(stepzs, stepzs, j); // we assume j < FF_MAX_PARALLEL_INVERTS here
775
for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]); // only invert x-coord here, we never need to invert y
776
for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);
777
778
/*
779
printf ("affine baby x's:\n", j);
780
for ( i = 0 ; i < bsteps ; i++ ) printf (" %d: %ld\n", i+1, _ff_get_ui(babys[i].x));
781
printf ("affine giant x's:\n", j);
782
for ( i = 0 ; i < gsteps ; i++ ) printf (" %ld: %ld\n", i*gstep, _ff_get_ui(giants[i].x));
783
*/
784
// Populate the table with babys. Inserts should never fail (baby steps can't be inverses provided bsteps < o/2)
785
tab_clear();
786
for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;
787
if ( i < bsteps ) { printf("%ld: baby step insert failed in dlog\n", _ff_p); exit (0); }
788
789
// Now match giant steps
790
for ( i = 0 ; i < gsteps ; i++ ) {
791
if ( (j=tab_lookup(giants[i].x)) >= 0 ) {
792
_ff_mult(t0,babys[j].y,giants[i].z3); _ff_mult(t1,giants[i].y,babys[j].z3); // normalize y values for comparison
793
if ( _ff_equal(t0,t1) ) return i*gstep+j+1;
794
return (i?i*gstep-j-1:o-j-1);
795
}
796
}
797
return 0;
798
}
799
800
801
/*
802
Support functions for hecurve_g1_bsgs_search...
803
*/
804
805
double baby_stretch[8] = {0.0,1.0,2.0,1.5,3.0,0.0,3.0,6.0};
806
807
// returns the (j+1)-th integer satisfying property determined by modcase.
808
static inline int baby_index(int j, int modcase)
809
{
810
register int k;
811
812
k=j&1;
813
switch (modcase) {
814
case 1: return j+1; // all values
815
case 2: return 2*j+1; // odd values (bspan,gsteps and giants even)
816
case 3: return 3*(j-k)/2+1+k; // prime to 3 (bspan, gstep, and giants multiples of 3)
817
case 4: return 3*(j+1); // 0 mod 3 (bspan and gstep multiples of 3, giants a mod 3)
818
case 6: return 6*(j-k)/2+1+4*k; // prime to 6 (bspan, gstep, and giants multiples of 6)
819
case 7: return 6*(j+1); // 0 mod 6 (bspan and gstep multiples of 6, giants a mod 6)
820
}
821
}
822
823
/*
824
sets exp to the unique multiple of k in [low,high] if there is one (and returns 1) or sets exp=k and returns 0
825
*/
826
static inline int set_exp (long *exp, long k, long low, long high)
827
{
828
register int i;
829
i = ui_multiples_in_range(k,low,high);
830
if ( ! i ) { printf ("no multiple of element order %ld in interval [%ld,%ld]\n", k, low, high); exit(0); }
831
if ( i == 1 ) { *exp = (high/k)*k; return 1; } else { *exp = k; return 0; }
832
}
833
834
835
/*
836
Uses BSGS (and fastorder) to compute the order k of the element b, given that some multiple of k lies in [low,high] (this is assumed, not verified).
837
The values m and a specify modular constraints on k as follows:
838
839
m=1 no constraint
840
m=2 k is odd (use odd baby steps, even giant steps)
841
m=3 k is prime to 3 (use baby steps 1,2 mod 3, giant steps 0 mod 3)
842
m=3 (a!=0) k = a mod 3 (use baby steps 0 mod 3, giant steps congruent to a mod 3)
843
m=6 k is prime to 6 (use baby steps 1,5 mod 6, giant steps 0 mod 6)
844
m=6 (a!=0) k = a mod 6 (use baby steps 0 mod 6, giant steps congruent to a mod 6)
845
846
The value modcase = m+(a?1:0) is used internally to uniquely identify these 6 situations.
847
Note that the giant steps are congruent to a mod m in every case.
848
849
If the return value is 1, then exp is the unique multiuple of k in [low,high].
850
If the return value is 0, exp=k is the order of b. This almost always means there is more than one multiple of k in [low,high], but not necessarily.
851
*/
852
int hecurve_g1_bsgs_search (long *exp, hec_jc_t *b, long low, long high, int m,int a, ff_t f1)
853
{
854
hec_jc_t p[64];
855
hec_jc_t baby_steps[3], giant_steps[1];
856
ff_t zinv[4], baby_x[3], baby_y[3], giant_x[1], giant_y[1];
857
register ff_t t0, t1, t2;
858
register int i,j,k,bsteps,gsteps,dsteps,usteps;
859
int bspan,gstep,modcase;
860
long o, o1, o2, gbase, range;
861
862
//printf ("%d: hecurve_g1_bsgs pt (%ld,%ld,%ld,%ld), low=%d, high=%d, m=%d, a=%d\n", _ff_p, _ff_get_ui(b->x), _ff_get_ui(b->y),_ff_get_ui(b->z2),_ff_get_ui(b->z3), low, high, m, a);
863
864
range = high-low+1; // range is inclusive, we assume high>low, so range > 1
865
bsteps = (long)sqrt((double)(range) / (2.0*baby_stretch[m]));
866
if ( ! bsteps ) bsteps = 1;
867
if ( m>2 && !a && (bsteps&1) ) bsteps++; // make sure bsteps is even if we are covering steps prime to 3 or 6
868
modcase = m + (a?1:0);
869
bspan = bsteps * baby_stretch[modcase]; // effective baby coverage depends on modular contraints m and a
870
if ( m > 1 ) bspan = m * _ui_ceil_ratio(bspan,m); // make sure bspan is a multiple of modulus m (we can safely round up)
871
gstep= 2*bspan; // giant span is effectively 2*bspan+1 due to inverses, but we use 2*bspan so bspan divides gspan...
872
if ( m==1 ) gstep++; // ...except when m=1, where we can use 2*bspan+1.
873
874
/*
875
Pick first giant step to be a multiple of as large a value of 2 as possible (and still lie in the interval [low,high]).
876
We then need to adjust to make it congruent to a mod m, but this changes at most 2 bits.
877
(An optimal approach might search for the value in [low,high] congruent to a mod m with minimal hamming weight, but this would likely take more time than it is worth).
878
879
The overall savings is only a few percent (it cuts the cost of computing the first giant step by 1/6, but computing the first giant step is less than 1/4 the total cost).
880
However, it's an easy optimization, so there is no reason not to do it.
881
882
This idea was suggested by Dan Bernstein.
883
*/
884
k = ui_lg_floor(range);
885
for (;;) {
886
o1 = (1L<<(k+1)); o = o1*_ui_ceil_ratio(low,o1);
887
if ( o > high ) break;
888
k++;
889
}
890
o1 = 1L<<k;
891
gbase = o1*_ui_ceil_ratio(low,o1); // gbase is now a multiple of a largish power of 2 in [low,high]
892
if ( m > 1 ) {
893
i = (gbase-a)%m;
894
gbase -= i; // gbase is now congruent to a mod m
895
if ( gbase < low ) gbase += m; // make sure we stay in [low,high]
896
if ( gbase > high ) gbase -= m;
897
}
898
for ( dsteps = (gbase-low)/gstep ; gbase - (dsteps-1)*gstep - bspan > low ; dsteps++ );
899
while ( gbase-(dsteps-1)*gstep <= 0 ) gbase += m; // make sure we don't step on zero or negative values
900
for ( usteps = (high-gbase)/gstep ; gbase + (usteps-1)*gstep + bspan < high ; usteps++ );
901
if ( dsteps+usteps+1 > BSGS_MAX_STEPS ) { printf ("Exceeded BSGS_MAX_STEPS=%d! p=%ld, low=%ld, high=%ld, m=%d, a=%d\n", BSGS_MAX_STEPS, _ff_p, low, high, m, a); exit (0); }
902
gsteps = dsteps+usteps-1; // dsteps and usteps both include starting step, so total is one less then the sum
903
904
//printf ("%d: bsteps=%d, gsteps=%d(%d,%d), bspan=%d, gstep=%d, first giant = %ld\n", _ff_p, bsteps, gsteps, dsteps, usteps, bspan, gstep, gbase);
905
906
// compute 2^k powers of b sufficient to compute b^gstep, and b^gbase
907
o = _ui_max(gstep,gbase);
908
k = ui_lg_floor(o);
909
p[0] = *b;
910
hecurve_g1_JC_powers (p, k+1, f1); // add 1 extra power for NAF
911
baby_steps[0] = *b; k = 1;
912
913
// compute baby step sizes depending on the value of modcase
914
switch (modcase) { // if m=1, fist step=step size=1
915
case 1: break;
916
case 2:
917
case 3: hecurve_g1_2JC (baby_steps+1, baby_steps, f1); k = 2; break; // first step is 1, steps size is 1 or 1,2
918
case 4: hecurve_g1_2JC (baby_steps+1,baby_steps,f1); hecurve_g1_JCJC(baby_steps+1,baby_steps,f1); k = 2; break; // first step = step size is 3
919
case 6: hecurve_g1_2JC (baby_steps+1, baby_steps, f1);
920
hecurve_g1_2JC (baby_steps+2, baby_steps+1, f1); k = 3; break; // first step is 1, step size is 2 or 4
921
case 7: hecurve_g1_2JC (baby_steps+1,baby_steps,f1); hecurve_g1_JCJC(baby_steps+1,baby_steps,f1);
922
hecurve_g1_2JC(baby_steps+1,baby_steps+1,f1); k = 2; break; // first step = step size is 6
923
default:
924
printf("Invalid modcase value %d in hecure_g1_bsgs\n", m); exit (0);
925
}
926
927
hecurve_g1_JC_exp_powers(giant_steps, p, gstep, f1);
928
hecurve_g1_JC_exp_powers(giants+dsteps-1, p, gbase, f1);
929
930
// If any of our baby steps is the identity, we won't get very far, so we need to check this situation (it can happen). The giant steps are handled below.
931
if ( k > 1 && hecurve_g1_JC_id(baby_steps+1) ) return set_exp(exp,(a?m:2),low,high);
932
if ( k > 2 && hecurve_g1_JC_id(baby_steps+2) ) return set_exp(exp,4,low,high);
933
934
// We need to convert the steps to affine coords before using them--do this with one field inversion.
935
for ( i = 0 ; i < k ; i++ ) _ff_set(zinv[i], baby_steps[i].z3);
936
j = k;
937
if ( ! hecurve_g1_JC_id(giant_steps) ) { _ff_set(zinv[j], giant_steps[0].z3); j++; } // giant step could be the identity
938
// invert all the z's
939
ff_parallel_invert (zinv, zinv, j);
940
for ( i = 0 ; i < k ; i++ ) hecurve_g1_JC_to_A (baby_x+i, baby_y+i, baby_steps+i, zinv[i]);
941
j=k;
942
if ( ! hecurve_g1_JC_id(giant_steps) ) { hecurve_g1_JC_to_A (giant_x, giant_y, giant_steps, zinv[j]); j++; }
943
944
// Now handle giant step = identity (we needed to get the baby in affine coords for the fastorder computation first)
945
if ( hecurve_g1_JC_id(giant_steps) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gstep, f1); return set_exp(exp,o,low,high); }
946
if ( hecurve_g1_JC_id(giants+dsteps-1) ) { o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase, f1); return set_exp(exp,o,low,high); }
947
948
/*
949
printf("affine baby step 0 (%ld,%ld)\n", _ff_get_ui(baby_x[0]),_ff_get_ui(baby_y[0]));
950
if (m>1) printf("affine baby step 1 (%ld,%ld)\n", _ff_get_ui(baby_x[1]),_ff_get_ui(baby_y[1]));
951
if (m==6) printf("affine baby step 2 (%ld,%ld)\n", _ff_get_ui(baby_x[2]),_ff_get_ui(baby_y[2]));
952
printf("affine giant step %d: (%ld,%ld)\n", gstep, _ff_get_ui(giant_x[0]),_ff_get_ui(giant_y[0]));
953
*/
954
955
// Baby steps
956
babys[0] = ( a ? baby_steps[1] : baby_steps[0] ); // take first baby step
957
switch (modcase) {
958
case 1: j=hecurve_g1_AJC_steps_1 (babys, baby_x[0], baby_y[0], bsteps, f1); break; // step on everything (use step size 1)
959
case 2: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on odd powers (use step size 2)
960
case 3: j=hecurve_g1_AJC_steps_2 (babys, baby_x[0], baby_y[0], baby_x[1], baby_y[1], bsteps, f1); break; // step on powers prime to 3 (alternate step sizes 1 and 2: 1,2,4,5,7,8,...)
961
case 4: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on multiples of 3
962
case 6: j=hecurve_g1_AJC_steps_2 (babys, baby_x[2], baby_y[2], baby_x[1], baby_y[1], bsteps, f1); break; // step on powers prime to 6 (alternate step sizes 4 and 2: 1,5,7,11,13,17,...)
963
case 7: j=hecurve_g1_AJC_steps_1 (babys, baby_x[1], baby_y[1], bsteps, f1); break; // step on multiples of 6
964
}
965
if ( j < bsteps ) { // if a baby step is the identity, we know the order of the element
966
k = baby_index(j,modcase); // note that we rely on the order being relatively prime to 2,3,6 (per m)
967
if ( a ) k/=m; // need to adjust for common divisor of baby steps when a!=0
968
return set_exp(exp,k,low,high);
969
}
970
971
// Giant steps
972
j = hecurve_g1_AJC_dsteps_1 (giants+dsteps-1, giant_x[0], giant_y[0], dsteps, f1); // downward steps first
973
if ( j < dsteps ) { // if a giant step hit the identity, it could be a multiple of the element order
974
// It might be faster to just continue in this situation, but it's rare in any event
975
o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase-j*gstep, f1);
976
return set_exp(exp,o,low,high);
977
}
978
j = hecurve_g1_AJC_steps_1 (giants+dsteps-1, giant_x[0], giant_y[0], usteps, f1); // now upward steps
979
if ( j < usteps ) { // if a giant step hit the identity, it could be a multiple of the element order
980
// It might be faster to just continue in this situation, but it's rare in any event
981
o = hecurve_g1_fastorder (baby_x[0], baby_y[0], gbase+j*gstep, f1);
982
return set_exp(exp,o,low,high);
983
}
984
gbase -= (dsteps-1)*gstep; // reset gbase to match giants[0]
985
986
// Convert to affine to get unique x coords
987
for ( i = j = 0 ; i < bsteps ; i++, j++ ) _ff_set(stepzs[j], babys[i].z2);
988
for ( i = 0 ; i < gsteps ; i++, j++ ) _ff_set(stepzs[j], giants[i].z2);
989
ff_parallel_invert(stepzs, stepzs, j); // we assume j < FF_MAX_PARALLEL_INVERTS here
990
for ( i = j = 0 ; i < bsteps ; i++,j++ ) ff_mult(babys[i].x, babys[i].x, stepzs[j]); // only invert x-coord here, we never need to invert y
991
for ( i = 0 ; i < gsteps ; i++,j++ ) ff_mult(giants[i].x, giants[i].x, stepzs[j]);
992
/*
993
printf ("affine baby x's:\n", j);
994
for ( i = 0 ; i < bsteps ; i++ ) printf (" %d(%d): %ld\n", i, baby_index(i,modcase), _ff_get_ui(babys[i].x));
995
printf ("affine giant x's:\n", j);
996
for ( i = 0 ; i < gsteps ; i++ ) printf (" %ld: %ld\n", gbase+i*gstep, _ff_get_ui(giants[i].x));
997
*/
998
// Populate the table with babys. Insert will fail if we try to insert the same x value twice - this can only happen if two babys are inverses (they can't be equal because we didn't hit the identity).
999
tab_clear();
1000
for ( i = 0 ; i < bsteps ; i++ ) if ( (j=tab_insert(babys[i].x,i)) >= 0 ) break;
1001
/*
1002
If we encountered two baby steps with the same x value, they must be inverses, since otherwise we would have hit the identity in between.
1003
If m=1,2,3,6 we have stepped on every possible element order between the two, and if m=4,5 implies we have stepped on every multiple of a common divisor of the two,
1004
and hence on every possible value of the difference of their indices (which would be the identity if the steps were equal).
1005
*/
1006
if ( i < bsteps ) {
1007
k = baby_index(i,modcase)+baby_index(j,modcase);
1008
if ( a ) k/=m; // need to adjust for common divisor of baby steps when a != 0
1009
return set_exp(exp,k,low,high);
1010
}
1011
1012
// Now match giant steps
1013
o1 = o2 = 0;
1014
for ( i = 0 ; i < gsteps ; i++ ) {
1015
if ( (j=tab_lookup(giants[i].x)) >= 0 ) {
1016
_ff_mult(t0,babys[j].y,giants[i].z3); _ff_mult(t1,giants[i].y,babys[j].z3); // normalize y values for comparison
1017
if ( _ff_zero(t0) ) { k = 2*baby_index(j,modcase); return set_exp(exp,k,low,high); } // handle 2-torsion case
1018
k = baby_index(j, modcase);
1019
if ( _ff_equal(t0,t1) ) k = -k;
1020
o = gbase+i*gstep+k;
1021
if ( o==o1 ) continue; // ignore duplicate matches, this can happen for a != 0
1022
if ( o1 ) { o2 = o; break; }
1023
o1 = o;
1024
}
1025
}
1026
if ( ! o1 ) { printf ("%ld: Error, BSGS search of entire Hasse-Weil interval failed, low=%ld, high=%ld, m=%d\n", _ff_p, low, high, m); exit (0); }
1027
if ( ! o2 ) { *exp = o1; return 1; } // we know the o1 is the unique multiple of the element order in [low,high]
1028
k=i_abs(o2-o1); // otherwise, we know |o2-o1| is a multiple of the element order and there are at least two multiples in [low,high]
1029
*exp = (m==1 ? k : hecurve_g1_fastorder (baby_x[0], baby_y[0], k, f1) ); // if m is not 1 we don't know that o1 and o2 are adjacent multiples, but a fastorder computation will figure it out
1030
return 0;
1031
}
1032
1033
/*
1034
Compute the order of affine point (x,y) given that (x,y)^e=1 using the classical algorithm.
1035
1036
This algorithm is not particularly fast, but it doesn't need to be, as it is not invoked for most p.
1037
The average number of bits exponentiated per prime for a non-CM curve (fixed curve, varying p) is less than 1.
1038
*/
1039
long hecurve_g1_fastorder (ff_t x, ff_t y, long e, ff_t f1)
1040
{
1041
hec_jc_t b[MAX_UI_PP_FACTORS];
1042
unsigned long q, p[MAX_UI_PP_FACTORS], h[MAX_UI_PP_FACTORS];
1043
register long o;
1044
register int i, j, k, w;
1045
1046
w = ui_factor(p,h,e); // takes about 2 microseconds for e~2^32 (2.5MHz AMD-64), but e is generally much smaller
1047
if ( w == 1 && h[0] == 1 ) return e; // not hard when the exponent is prime
1048
1049
for ( i = 0 ; i < w ; i++ ) {
1050
for ( q=p[i],j=1 ; j < h[i] ; j++ ) q *= p[i];
1051
o = e/q;
1052
hecurve_g1_AJC_exp_ui (b+i, x, y, o, f1);
1053
}
1054
o = 1;
1055
for ( i = 0 ; i < w ; i++ ) {
1056
for ( j = 0 ; j < h[i]-1 ; j++ ) {
1057
if ( hecurve_g1_JC_id(b+i) ) break;
1058
hecurve_g1_JC_exp_ui (b+i, b+i, p[i], f1);
1059
}
1060
if ( ! hecurve_g1_JC_id (b+i) ) j++;
1061
for ( k = 0 ; k < j ; k++ ) o *= p[i];
1062
}
1063
//printf ("fastorder computed order of (%ld,%ld) is %ld from exponent %ld\n", _ff_get_ui(x), _ff_get_ui(y), o, e);
1064
return o;
1065
}
1066
1067
/*
1068
BSGS search hardwired for very short intervals (typically less than 50), which uses only one or two baby steps.
1069
These intervals can arise when the first order computation does not uniquely determine the exponent of the group (or when p is small).
1070
1071
We assume b does not have 2 torsion (hence its order is at least 3) and high > low.
1072
We rely on the existence of an exponent of b in [low,high] and do not always verify this (e.g., if high=low+1 and b^low != id then we conclude b^high == id)
1073
*/
1074
int hecurve_g1_bsgs_short (long *exp, hec_jc_t *b, long low, long high, ff_t f1)
1075
{
1076
long range;
1077
hec_jc_t b2[1],b5[1],g[1];
1078
register ff_t t0, t1;
1079
register long e;
1080
long o1,o2;
1081
int i;
1082
1083
range = high-low+1;
1084
switch (range) {
1085
case 2: // E no computation other than the g exponentiation, cost E
1086
hecurve_g1_JC_exp_ui(g, b, low, f1);
1087
*exp = ( hecurve_g1_JC_id(g) ? low : high );
1088
return 1;
1089
case 3: // E+4/3M (on average)
1090
hecurve_g1_JC_exp_ui(g, b, low+1, f1);
1091
if ( hecurve_g1_JC_id(g) ) { *exp = low+1; return 1; }
1092
// it must be the case that g = +/- b, we don't verify this but simply compare y values to distinguish
1093
_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);
1094
*exp = ( _ff_equal (t0, t1) ? low : high );
1095
return 1;
1096
case 4: // E+1/4D+2M = E+5M+2A (roughly) (D indicates doubling(squaring) a point on the curve, which costs 11M+8A field operations)
1097
hecurve_g1_JC_exp_ui(g, b, low+1, f1);
1098
if ( hecurve_g1_JC_id(g) ) { *exp = low+1; return 1; }
1099
_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);
1100
if ( _ff_equal (t0, t1) ) {
1101
_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);
1102
if ( ! _ff_equal (t0, t1) ) { *exp = low+2; return 1; }
1103
// we know b^low = id, but it could also be that b^high = id if b has order 3, so we need to check this
1104
hecurve_g1_2JC (b2,b, f1);
1105
_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2); // given b != id, b^3==id iff b and b^2 have the same x coord
1106
if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1107
*exp = low;
1108
return 1;
1109
}
1110
*exp = high;
1111
return 1;
1112
case 5: // < E+4/5D+5M (on average)
1113
hecurve_g1_JC_exp_ui(g, b, low+2, f1);
1114
if ( hecurve_g1_JC_id(g) ) { *exp = low+2; return 1; }
1115
hecurve_g1_2JC (b2,b, f1);
1116
if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }
1117
if ( hecurve_g1_JC_2tor(b2) ) { *exp = 4; return 0; }
1118
_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2);
1119
if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1120
// we now know |b|>4, so there is a unique multiple in the interval
1121
_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);
1122
if ( _ff_equal (t0, t1) ) {
1123
_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);
1124
*exp = ( _ff_equal(t0,t1) ? low+1 : low+3 );
1125
return 1;
1126
}
1127
_ff_mult(t0,b2->y,g->z3); _ff_mult(t1,g->y,b2->z3);
1128
*exp = ( _ff_equal (t0, t1) ? low : low+4 );
1129
return 1;
1130
}
1131
hecurve_g1_2JC (b2,b, f1);
1132
if ( hecurve_g1_JC_id(b2) ) { *exp = 2; return 0; }
1133
if ( hecurve_g1_JC_2tor (b2) ) { *exp = 4; return 0; }
1134
_ff_mult(t0,b->x,b2->z2); _ff_mult(t1,b2->x,b->z2);
1135
if ( _ff_equal(t0,t1) ) { *exp = 3; return 0; }
1136
hecurve_g1_2JC (b5,b2, f1);
1137
hecurve_g1_JCJC (b5,b, f1);
1138
if ( hecurve_g1_JC_id(b5) ) { *exp = 5; return 0; }
1139
// we know |b|>5
1140
e = low+2;
1141
hecurve_g1_JC_exp_ui(g, b, e, f1);
1142
o1 = o2 = 0;
1143
while ( e < high+3 ) {
1144
if ( hecurve_g1_JC_id(g) ) { o2 = o1; o1 = e; goto next; }
1145
_ff_mult(t0,b->x,g->z2); _ff_mult(t1,g->x,b->z2);
1146
if ( _ff_equal (t0, t1) ) {
1147
_ff_mult(t0,b->y,g->z3); _ff_mult(t1,g->y,b->z3);
1148
i = ( _ff_equal(t0,t1) ? -1 : 1 );
1149
o2 = o1; o1 = e+i; goto next;
1150
}
1151
_ff_mult(t0,b2->x,g->z2); _ff_mult(t1,g->x,b2->z2);
1152
if ( _ff_equal (t0, t1) ) {
1153
_ff_mult(t0,b2->y,g->z3); _ff_mult(t1,g->y,b2->z3);
1154
i = ( _ff_equal(t0,t1) ? -2 : 2 );
1155
o2 = o1; o1 = e+i; goto next;
1156
}
1157
next: if ( o2 ) break;
1158
hecurve_g1_JCJC (g,b5,f1);
1159
e += 5;
1160
}
1161
if ( ! o1 ) { printf ("%ld: No match found in short interval [%ld,%ld]\n", _ff_p, low, high); exit (0); }
1162
if ( o2 ) { *exp = o1-o2; return 0; }
1163
*exp = o1;
1164
// total cost is roughly E + 2D + (1+(range/5-1))*A + (range/5)*4M (D indicates point doubling, A indicates point addition)
1165
return 1;
1166
}
1167
1168
/*
1169
The rest of this file contains code that is not currently used because it was found to to be suboptimal in testing (on an AMD Athlon) but
1170
may be useful on other platforms and/or other applications.
1171
*/
1172
#if 0
1173
1174
/*
1175
Below are elliptic curve arithmetic functions for Jacobian coordinates. The reduced Chudnovsky coordinates
1176
are faster at everything except 2JC versus 2J is (11M+8A) vs (10M+10A), but this difference is negligible (1M~2.5A),
1177
and in testing it was found to be faster to use JC everywhere.
1178
*/
1179
1180
// converts Jacobian to affine coords, given the inverse of z
1181
static inline void hecurve_g1_J_to_A (ff_t *px, ff_t *py, hec_j_t *p, ff_t zinv)
1182
{
1183
register ff_t t0,t1;
1184
1185
_ff_square(t0,zinv);
1186
_ff_mult(*px,p->x,t0);
1187
_ff_mult(t1,t0,zinv);
1188
_ff_mult(*py,p->y,t1);
1189
// 4M
1190
}
1191
1192
// converts Jacobian to reduced Chudnovsky Jacobian coords
1193
static inline void hecurve_g1_J_to_JC (hec_jc_t *o, hec_j_t *p)
1194
{
1195
_ff_set(o->x,p->x);
1196
_ff_set(o->y,p->y);
1197
_ff_square(o->z2,p->z);
1198
_ff_mult(o->z3,o->z2,p->z);
1199
// 2M
1200
}
1201
1202
// squares an affine point into Jacobian coords
1203
static inline void hecurve_g1_2AJ (hec_j_t *p, ff_t x, ff_t y, ff_t f1)
1204
{
1205
register ff_t t0,t1,t2,t3,a,b;
1206
1207
_ff_add(p->z,y,y); _ff_mult(t1,p->z,y); _ff_add(t2,t1,t1); _ff_mult(a,x,t2); // z=2y, a=4xy^2, t1=2y^2, t2=4y^2
1208
_ff_mult(t3,t1,t2); // t3=8y^4
1209
_ff_square(t1,x); _ff_add(b,t1,t1); _ff_addto(b,t1); _ff_addto(b,f1); // b = 3x^2+f1*1^4
1210
_ff_square(t0,b); _ff_add(t2,a,a); _ff_sub(p->x,t0,t2); // x = b^2-2a
1211
_ff_sub(t1,a,p->x); _ff_mult(t2,t1,b); _ff_sub(p->y,t2,t3); // y = b(a-x)-8y^4 (note we use the new x here)
1212
// 6M+9A
1213
}
1214
1215
1216
// squares a point p1 in Jacobian coords (p3 is the output, may be equal to p1)
1217
static inline void hecurve_g1_2J (hec_j_t *p3, hec_j_t *p1, ff_t f1)
1218
{
1219
register ff_t a, b, c, t0, t1, t2;
1220
1221
_ff_square(t0,p1->x); _ff_add(t1,t0,t0); _ff_addto(t1,t0); // t1 = 3x^2
1222
_ff_square(t0,p1->z); _ff_square(t2,t0); _ff_mult(t0,f1,t2); _ff_add(b,t1,t0); // b = 3x^2+f1*z^4 (note that f1=a4 in 13.2.1.c of HECHECC p.282)
1223
_ff_add(c,p1->y,p1->y); ff_mult(p3->z,p1->z,c); // c=2y, z = 2yz
1224
_ff_mult(t2,c,p1->y); _ff_add(t0,t2,t2); _ff_mult(a,t0,p1->x); // a=4xy^2,t2=2y^2
1225
_ff_add(t0,a,a); _ff_square(t1,b); _ff_sub(p3->x,t1,t0); // x = b^2-2a
1226
_ff_square(t0,t2); _ff_add(c,t0,t0); // c = 8y^4
1227
_ff_sub(t0,a,p3->x); _ff_mult(t2,t0,b); _ff_sub(p3->y,t2,c); // y = b(a-x)-c -- note we use the new x value here
1228
// 10M+10A
1229
}
1230
1231
1232
// multiplies a point p in Jacobian coords by a (non-identity) affine point (p is an input and an output)
1233
static inline void hecurve_g1_AJ (hec_j_t *p, ff_t x0, ff_t y0, ff_t f1)
1234
{
1235
register ff_t a, c, e, f, t0, t1, t2;
1236
1237
if ( _ff_zero(p->z) ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }
1238
_ff_square(t0,p->z); _ff_mult(a,x0,t0); // a = x0z^2, b = x*1^2=x (since z0=1), and t0=z^2
1239
_ff_sub(e,p->x,a); // e = a-b
1240
_ff_mult(t1,t0,p->z); _ff_mult(c,y0,t1); // c = y0z^3, d=y*1^3=y
1241
_ff_sub(f,p->y,c); // f = d-c
1242
if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2AJ (p, x0,y0,f1); return; } // must use doubling code here, but at least its an affine double (6M+9A)
1243
_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // t1=e^3, t2=ae^2
1244
_ff_square(t0,f); _ff_sub(p->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p->x,t0); // x = f^2-e^3-2ae^2
1245
_ff_sub(t0,t2,p->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p->y,t2,t0); // y = f(ae^2-x) - ce^3
1246
ff_mult(p->z,p->z,e); // z = 1*z*e
1247
// 11M+6A
1248
}
1249
1250
1251
// multiplies p1 in Jacobian coords by p2 Jacobian coords and puts the result in p1. Handles identity and will square (double) if needed
1252
// this is the slowest case and should be avoided when possible, AJ, AJC, and JCJC are all better
1253
static inline void hecurve_g1_JJ (hec_j_t *p1, hec_j_t *p2, ff_t f1)
1254
{
1255
register ff_t a, b, c, d, e, f, t0, t1, t2;
1256
1257
// we need to handle identity cases in general (although in some cases we might be able to rule this out)
1258
if ( _ff_zero(p2->z) ) return;
1259
if ( _ff_zero(p1->z) ) { *p1=*p2; return; }
1260
_ff_square(t0,p2->z); _ff_mult(a,p1->x,t0); // a = x1z2^2, and t0=z2^2
1261
_ff_square(t1,p1->z); _ff_mult(b,p2->x,t1); // b = x2z1^2, and t1=z1^2
1262
_ff_sub(e,b,a); // e = a-b
1263
_ff_mult(t2,t0,p2->z); _ff_mult(c,p1->y,t2); // c = y1z2^3
1264
_ff_mult(t2,t1,p1->z); _ff_mult(d,p2->y,t2); // d = y2z1^3
1265
_ff_sub(f,d,c); // f = d-c
1266
if ( _ff_zero(e) && _ff_zero(f) ) { hecurve_g1_2J (p1,p1,f1); return; } // must double if pts are equal (10M+10A), inverses will end up with z=0, so we let them through
1267
_ff_square(t0,e); _ff_mult(t1,t0,e); _ff_mult(t2,t0,a); // t1=e^3, t2=ae^2
1268
_ff_square(t0,f); _ff_sub(p1->x,t0,t1); _ff_add(t0,t2,t2); _ff_subfrom(p1->x,t0); // x = f^2-e^3-2ae^2
1269
_ff_sub(t0,t2,p1->x); _ff_mult(t2,f,t0); _ff_mult(t0,t1,c); _ff_sub(p1->y,t2,t0); // y = f(ae^2-x) - ce^3 -- note we use the new x here
1270
ff_mult(t0,p1->z,p2->z); _ff_mult(p1->z,t0,e); // z = z1z2e
1271
// 16M+7A (ouch)
1272
}
1273
1274
1275
// Computes p=(x,y)^n where (x,y) !=1 is in affine coordinates and p is in Jacobian coordinates
1276
// It takes advantage of the fact that all additions are of the form J+A, requiring only 11M, doubling is done in J+J form, using 10M
1277
void hecurve_g1_AJ_exp_ui (hec_j_t *p, ff_t x0, ff_t y0, unsigned long n, ff_t f1)
1278
{
1279
register unsigned long m;
1280
unsigned long pbits, nbits;
1281
register ff_t negy0;
1282
int i;
1283
1284
if ( n == 0 ) { _ff_set_zero(p->z); return; }
1285
if ( n == 1 ) { _ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z); return; }
1286
hecurve_g1_2AJ(p,x0,y0,f1);
1287
if ( n == 2 ) return;
1288
ui_NAF(&pbits,&nbits,n);
1289
i = _asm_highbit(pbits)-2; // we know the top two bits of the NAF are 10
1290
hecurve_expbits+=i+1;
1291
_ff_neg(negy0,y0);
1292
m = (1UL<<i);
1293
for ( ; m ; m >>= 1 ) {
1294
hecurve_g1_2J(p,p,f1); // 10M+10A
1295
if ( m&pbits ) hecurve_g1_AJ(p,x0,y0,f1); // 11M+6A
1296
if ( m&nbits ) hecurve_g1_AJ(p,x0,negy0,f1); // 11M+6A
1297
}
1298
}
1299
1300
1301
// Computes p=a^e where a!=1 is in Jacobian coords, and so is p.
1302
// This is slow and should be avoided. Overlap is ok
1303
void hecurve_g1_J_exp_ui (hec_j_t *p, hec_j_t *a, unsigned long n, ff_t f1)
1304
{
1305
register unsigned long m;
1306
unsigned long pbits, nbits;
1307
hec_j_t t, ai;
1308
int i;
1309
1310
if ( n == 0 ) { _ff_set_zero(p->z); return; }
1311
if ( n == 1 ) { *p=*a; return; }
1312
hecurve_g1_2J(&t,a,f1); // 10M+10A
1313
if ( n == 2 ) { *p = t; return; }
1314
ui_NAF(&pbits,&nbits,n);
1315
i = _asm_highbit(pbits)-2;
1316
hecurve_expbits+=i+1;
1317
ai=*a; ff_negate(ai.y);
1318
m = (1UL<<i);
1319
for ( ; m ; m >>= 1 ) {
1320
hecurve_g1_2J(&t,&t,f1); // 10M+10A
1321
if ( m&pbits ) hecurve_g1_JJ(&t,a,f1); // 16M+7A
1322
if ( m&nbits ) hecurve_g1_JJ(&t,&ai,f1); // 16M+7A
1323
}
1324
*p = t;
1325
}
1326
1327
1328
// Combines precomputed values p[i]=p[0]^(2^i) to compute p[0]^n. Assumes all required powers are present: NAF potentially requires one more bit!
1329
// CAUTION: if this is false, the resulting behavior is very strange and unpredictable. You have been warned.
1330
void hecurve_g1_J_exp_powers(hec_j_t *o, hec_j_t *p, unsigned long n, ff_t f1)
1331
{
1332
register unsigned long m;
1333
unsigned long pbits, nbits;
1334
hec_j_t t;
1335
int i;
1336
1337
// handle small n quickly
1338
switch (n) {
1339
case 0: _ff_set_zero(o->z); return;
1340
case 1: *o=p[0]; return;
1341
case 2: *o=p[1]; return;
1342
case 3: *o=p[0]; hecurve_g1_JJ(o,p+1,f1); return;
1343
case 4: *o=p[2]; return;
1344
case 5: *o=p[2]; hecurve_g1_JJ(o,p,f1); return;
1345
case 6: *o=p[2]; hecurve_g1_JJ(o,p+1,f1); return;
1346
case 7: *o=p[0]; ff_negate(o->y); hecurve_g1_JJ(o,p+3,f1); return;
1347
case 8: *o=p[3]; return;
1348
}
1349
ui_NAF(&pbits,&nbits,n);
1350
i = _asm_highbit(pbits);
1351
*o = p[i];
1352
i-=2;
1353
hecurve_expbits+=i+1;
1354
m = (1UL<<i);
1355
for ( ; m ; m >>= 1, i-- ) {
1356
if ( (m&pbits) ) hecurve_g1_JJ(o,p+i,f1); // 16M+7A (these are expensive)
1357
if ( (m&nbits) )
1358
{ ff_negate(o->y); hecurve_g1_JJ(o,p+i,f1); ff_negate(o->y); } // 16M+9A (slightly more expensive)
1359
}
1360
}
1361
1362
// Sets p[i] = p[0]^(2^i) for i in [0,k]. Input is an affine pt not equal to the identity, output is in Jacobian coords.
1363
void hecurve_g1_AJ_powers (hec_j_t p[], ff_t x0, ff_t y0, int k, ff_t f1)
1364
{
1365
ff_t a, b, c, x, y, z, t0, t1, t2;
1366
register hec_j_t *e;
1367
1368
_ff_set(p->x,x0); _ff_set(p->y,y0); _ff_set_one(p->z);
1369
hecurve_g1_2AJ (p+1, x0, y0, f1);
1370
for ( e=p+k,p+=2 ; p <= e ; p++ ) hecurve_g1_2J(p,p-1,f1); // 10M + 10A
1371
}
1372
#endif
1373
1374